swh:1:snp:eb70f1f85391e4b077c211bec36af0061c4bf937
Raw File
Tip revision: b0b767b91ca077a14368eaac1f98120261d7460c authored by Antoine R. Dumont (@ardumont) on 30 April 2020, 11:20:08 UTC
pg: Write both origin visit updates & status, read from origin_visit
Tip revision: b0b767b
origin.py
# Copyright (C) 2019  The Software Heritage developers
# See the AUTHORS file at the top-level directory of this distribution
# License: GNU General Public License version 3, or any later version
# See top-level LICENSE file for more information


def iter_origins(storage, origin_from=1, origin_to=None, batch_size=10000):
    """Iterates over all origins in the storage.

    Args:
        storage: the storage object used for queries.
        batch_size: number of origins per query
    Yields:
        dict: the origin dictionary with the keys:

        - type: origin's type
        - url: origin's url
    """
    start = origin_from
    while True:
        if origin_to:
            origin_count = min(origin_to - start, batch_size)
        else:
            origin_count = batch_size
        origins = list(
            storage.origin_get_range(origin_from=start, origin_count=origin_count)
        )
        if not origins:
            break
        start = origins[-1]["id"] + 1
        for origin in origins:
            del origin["id"]
            yield origin
        if origin_to and start > origin_to:
            break
back to top