From ff29c1ff56a64a8448edeca04e105777031b898e Mon Sep 17 00:00:00 2001
From: szucsitg <32290917+szucsitg@users.noreply.github.com>
Date: Mon, 22 Sep 2025 13:34:15 +0000
Subject: [PATCH] feat: handle exceptions and errors gracefully

Signed-off-by: szucsitg <32290917+szucsitg@users.noreply.github.com>
---
 action.yml                       |  2 +-
 kernel_crawler/bottlerocket.py   |  2 +-
 kernel_crawler/crawler.py        | 51 ++++++++++++++---------
 kernel_crawler/deb.py            |  2 +-
 kernel_crawler/flatcar.py        |  2 +-
 kernel_crawler/main.py           | 12 ++++--
 kernel_crawler/rpm.py            |  9 ++--
 kernel_crawler/utils/download.py | 71 +++++++++++++++++++++-----------
 8 files changed, 97 insertions(+), 54 deletions(-)

diff --git a/action.yml b/action.yml
index 4de397b..b5863d4 100644
--- a/action.yml
+++ b/action.yml
@@ -35,7 +35,7 @@ runs:
       shell: bash
       working-directory: ${{ github.action_path }}
       run: |
-        kernel-crawler crawl --distro=${{ inputs.distro }} --arch=${{ inputs.arch }} > ${{ runner.temp }}/kernels_${{ inputs.arch }}.json
+        kernel-crawler crawl --distro=${{ inputs.distro }} --arch=${{ inputs.arch }} --output ${{ runner.temp }}/kernels_${{ inputs.arch }}.json
 
     - name: Validate json
       shell: bash
diff --git a/kernel_crawler/bottlerocket.py b/kernel_crawler/bottlerocket.py
index 5206eaa..ea171d7 100644
--- a/kernel_crawler/bottlerocket.py
+++ b/kernel_crawler/bottlerocket.py
@@ -32,7 +32,7 @@ def fetch_base_config(self, kverspec):
         if source is None:
             return None
 
-        alkernel = requests.get(source)
+        alkernel = requests.get(source, timeout = 15)
         alkernel.raise_for_status()
         with open('/tmp/alkernel.rpm', 'wb') as f:
             f.write(alkernel.content)
diff --git a/kernel_crawler/crawler.py b/kernel_crawler/crawler.py
index 92190d5..6199a1b 100644
--- a/kernel_crawler/crawler.py
+++ b/kernel_crawler/crawler.py
@@ -11,6 +11,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from requests.exceptions import ConnectTimeout, ReadTimeout, Timeout, RequestException, ConnectionError
 from . import repo
 from .minikube import MinikubeMirror
 from .aliyunlinux import AliyunLinuxMirror
@@ -84,25 +85,37 @@ def crawl_kernels(distro, version, arch, images):
 
     for distname, dist in DISTROS.items():
         if distname == distro or distro == "*":
-            # If the distro requires an image (Redhat only so far), we need to amalgamate
-            # the kernel versions from the supplied images before choosing the output.
-            if issubclass(dist, repo.ContainerDistro):
-                if images:
-                    kv = {}
-                    for image in images:
-                        d = dist(image)
-                        if len(kv) == 0:
-                            kv = d.get_kernel_versions()
-                        else:
-                            kv.update(d.get_kernel_versions())
-                    # We should now have a list of all kernel versions for the supplied images
-                    res = kv
+            try:
+                # If the distro requires an image (Redhat only so far), we need to amalgamate
+                # the kernel versions from the supplied images before choosing the output.
+                if issubclass(dist, repo.ContainerDistro):
+                    if images:
+                        kv = {}
+                        for image in images:
+                            d = dist(image)
+                            if len(kv) == 0:
+                                kv = d.get_kernel_versions()
+                            else:
+                                kv.update(d.get_kernel_versions())
+                        # We should now have a list of all kernel versions for the supplied images
+                        res = kv
+                    else:
+                        d = None
                 else:
-                    d = None
-            else:
-                d = dist(arch)
-                res = d.get_package_tree(version)
+                    d = dist(arch)
+                    res = d.get_package_tree(version)
+
+                if d and res:
+                    ret[distname] = to_driverkit_config(d, res)
+
+            except (ConnectTimeout, ReadTimeout, Timeout):
+                print(f"[ERROR] Timeout while fetching data for distro '{distname}'")
+            except ConnectionError:
+                print(f"[ERROR] Network unreachable or host down for distro '{distname}'")
+            except RequestException as e:
+                print(f"[ERROR] Request failed for distro '{distname}': {e}")
+            except Exception as e:
+                # Catch-all for unexpected issues
+                print(f"[ERROR] Unexpected error in distro '{distname}': {e}")
 
-            if d and res:
-                ret[distname] = to_driverkit_config(d, res)
     return ret
diff --git a/kernel_crawler/deb.py b/kernel_crawler/deb.py
index 890420b..86a56e2 100644
--- a/kernel_crawler/deb.py
+++ b/kernel_crawler/deb.py
@@ -276,7 +276,7 @@ def scan_repo(self, dist):
 
     def list_repos(self):
         dists_url = self.base_url + 'dists/'
-        dists = requests.get(dists_url)
+        dists = requests.get(dists_url, timeout = 15)
         dists.raise_for_status()
         dists = dists.content
         doc = html.fromstring(dists, dists_url)
diff --git a/kernel_crawler/flatcar.py b/kernel_crawler/flatcar.py
index 8ec17e3..1c3fc65 100644
--- a/kernel_crawler/flatcar.py
+++ b/kernel_crawler/flatcar.py
@@ -47,7 +47,7 @@ def __init__(self, arch):
 
     def scan_repo(self, base_url):
         try:
-            dists = requests.get(base_url)
+            dists = requests.get(base_url, timeout = 15)
             dists.raise_for_status()
         except requests.exceptions.RequestException:
             return {}
diff --git a/kernel_crawler/main.py b/kernel_crawler/main.py
index 507f106..08e8f39 100644
--- a/kernel_crawler/main.py
+++ b/kernel_crawler/main.py
@@ -54,14 +54,20 @@ def handle_parse_result(self, ctx, opts, args):
         return super(DistroImageValidation, self).handle_parse_result(ctx, opts, args)
 
 @click.command()
-@click.option('--distro', type=click.Choice(sorted(list(DISTROS.keys())) + ['*'], case_sensitive=True))
+@click.option('--distro', type=click.Choice(sorted(list(DISTROS.keys())) + ['*'], case_sensitive=True), required=True)
 @click.option('--version', required=False, default='')
 @click.option('--arch', required=False, type=click.Choice(['x86_64', 'aarch64'], case_sensitive=True), default='x86_64')
 @click.option('--image', cls=DistroImageValidation, required_if_distro=["Redhat"], multiple=True)
-def crawl(distro, version='', arch='', image=''):
+@click.option('--output', type=click.Path(dir_okay=False, writable=True), help="Optional file path to write JSON output")
+def crawl(distro, version='', arch='', image='', output=None):
     res = crawl_kernels(distro, version, arch, image)
     json_object = json.dumps(res, indent=2, default=vars)
-    print(json_object)
+    if output:
+        with open(output, 'w', encoding='utf-8') as f:
+            f.write(json_object)
+        click.echo(f"[INFO] JSON output written to {output}")
+    else:
+        click.echo(json_object)
 
 cli.add_command(crawl, 'crawl')
 
diff --git a/kernel_crawler/rpm.py b/kernel_crawler/rpm.py
index 41ec202..555b779 100644
--- a/kernel_crawler/rpm.py
+++ b/kernel_crawler/rpm.py
@@ -133,7 +133,8 @@ def dist_exists(self, dist):
                 self.dist_url(dist),
                 headers={ # some URLs require a user-agent, otherwise they return HTTP 406 - this one is fabricated
                     'user-agent': 'dummy'
-                }
+                },
+                timeout = 15
             )
             r.raise_for_status()
         except requests.exceptions.RequestException:
@@ -145,7 +146,8 @@ def list_repos(self):
             self.base_url,
             headers={ # some URLs require a user-agent, otherwise they return HTTP 406 - this one is fabricated
                 'user-agent': 'dummy'
-            }
+            },
+            timeout = 15
         )
         dists.raise_for_status()
         dists = dists.content
@@ -185,7 +187,8 @@ def list_repos(self):
             self.base_url,
             headers={ # some URLs require a user-agent, otherwise they return HTTP 406 - this one is fabricated
                 'user-agent': 'dummy'
-            }
+            },
+            timeout = 15
         )
         dists.raise_for_status()
         dists = dists.content
diff --git a/kernel_crawler/utils/download.py b/kernel_crawler/utils/download.py
index 2d05da7..fc3f180 100644
--- a/kernel_crawler/utils/download.py
+++ b/kernel_crawler/utils/download.py
@@ -9,32 +9,53 @@
 except ImportError:
     from backports import lzma
 
+from requests.exceptions import (
+    ConnectTimeout,
+    ReadTimeout,
+    Timeout,
+    ConnectionError,
+    RequestException,
+)
+
+
 def get_url(url):
-    resp = requests.get(
-        url,
-        headers={ # some URLs require a user-agent, otherwise they return HTTP 406 - this one is fabricated
-            'user-agent': 'dummy'
-        }
-    )
-
-    # if 404, silently fail
-    if resp.status_code == 404:
-        return None
-    else: # if any other error, raise the error - might be a bug in crawler
-        resp.raise_for_status()
-
-    # if no error, return the (eventually decompressed) contents
-    if url.endswith('.gz'):
-        return zlib.decompress(resp.content, 47)
-    elif url.endswith('.xz'):
-        return lzma.decompress(resp.content)
-    elif url.endswith('.bz2'):
-        return bz2.decompress(resp.content)
-    elif url.endswith('.zst'):
-        with zstandard.ZstdDecompressor().stream_reader(io.BytesIO(resp.content)) as rr:
-            return rr.read()
-    else:
-        return resp.content
+    try:
+        resp = requests.get(
+            url,
+            headers={ # some URLs require a user-agent, otherwise they return HTTP 406 - this one is fabricated
+                'user-agent': 'dummy'
+            },
+            timeout=15,
+        )
+
+        # if 404, silently fail
+        if resp.status_code == 404:
+            return None
+        else: # if any other error, raise the error - might be a bug in crawler
+            resp.raise_for_status()
+
+        # if no error, return the (eventually decompressed) contents
+        if url.endswith('.gz'):
+            return zlib.decompress(resp.content, 47)
+        elif url.endswith('.xz'):
+            return lzma.decompress(resp.content)
+        elif url.endswith('.bz2'):
+            return bz2.decompress(resp.content)
+        elif url.endswith('.zst'):
+            with zstandard.ZstdDecompressor().stream_reader(io.BytesIO(resp.content)) as rr:
+                return rr.read()
+        else:
+            return resp.content
+
+    except (ConnectTimeout, ReadTimeout, Timeout):
+        print(f"[ERROR] Timeout fetching {url}")
+    except ConnectionError:
+        print(f"[ERROR] Network unreachable or host down: {url}")
+    except RequestException as e:
+        print(f"[ERROR] Request failed for {url}: {e}")
+    except Exception as e:
+        print(f"[ERROR] Unexpected error fetching {url}: {e}")
+    return None
 
 
 def get_first_of(urls):
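
Note, not part of the patch: a minimal usage sketch of how the hardened helper is expected to behave after this change. It assumes the package is importable as kernel_crawler and uses a made-up mirror URL purely for illustration. With the patch applied, timeouts, connection failures, and other request errors inside get_url() are caught and reported, and the caller simply receives None, the same value already returned for a 404, so a single unreachable mirror no longer aborts the whole crawl.

    from kernel_crawler.utils.download import get_url

    # Hypothetical mirror URL, used only to illustrate the new failure behaviour.
    body = get_url("https://mirrors.example.org/repo/repodata/primary.xml.gz")
    if body is None:
        # 404s, timeouts, and connection/request errors all surface here as None now.
        print("mirror unavailable or missing, skipping")
    else:
        # .gz payloads are decompressed by get_url() before being returned.
        print(f"fetched {len(body)} bytes of repository metadata")

    # The new --output flag in main.py covers the CLI side of the same goal,
    # e.g.: kernel-crawler crawl --distro='*' --arch=x86_64 --output kernels.json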