From ff29c1ff56a64a8448edeca04e105777031b898e Mon Sep 17 00:00:00 2001
From: szucsitg <32290917+szucsitg@users.noreply.github.com>
Date: Mon, 22 Sep 2025 13:34:15 +0000
Subject: [PATCH] feat: handle exceptions and errors gracefully

Signed-off-by: szucsitg <32290917+szucsitg@users.noreply.github.com>
---
 action.yml                       |  2 +-
 kernel_crawler/bottlerocket.py   |  2 +-
 kernel_crawler/crawler.py        | 51 ++++++++++++++---------
 kernel_crawler/deb.py            |  2 +-
 kernel_crawler/flatcar.py        |  2 +-
 kernel_crawler/main.py           | 12 ++++--
 kernel_crawler/rpm.py            |  9 ++--
 kernel_crawler/utils/download.py | 71 +++++++++++++++++++++-----------
 8 files changed, 97 insertions(+), 54 deletions(-)

diff --git a/action.yml b/action.yml
index 4de397b..b5863d4 100644
--- a/action.yml
+++ b/action.yml
@@ -35,7 +35,7 @@ runs:
       shell: bash
       working-directory: ${{ github.action_path }}
       run: |
-        kernel-crawler crawl --distro=${{ inputs.distro }} --arch=${{ inputs.arch }} > ${{ runner.temp }}/kernels_${{ inputs.arch }}.json
+        kernel-crawler crawl --distro=${{ inputs.distro }} --arch=${{ inputs.arch }} --output ${{ runner.temp }}/kernels_${{ inputs.arch }}.json
 
     - name: Validate json
       shell: bash
diff --git a/kernel_crawler/bottlerocket.py b/kernel_crawler/bottlerocket.py
index 5206eaa..ea171d7 100644
--- a/kernel_crawler/bottlerocket.py
+++ b/kernel_crawler/bottlerocket.py
@@ -32,7 +32,7 @@ def fetch_base_config(self, kverspec):
         if source is None:
             return None
 
-        alkernel = requests.get(source)
+        alkernel = requests.get(source, timeout = 15)
         alkernel.raise_for_status()
         with open('/tmp/alkernel.rpm', 'wb') as f:
             f.write(alkernel.content)
diff --git a/kernel_crawler/crawler.py b/kernel_crawler/crawler.py
index 92190d5..6199a1b 100644
--- a/kernel_crawler/crawler.py
+++ b/kernel_crawler/crawler.py
@@ -11,6 +11,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+from requests.exceptions import ConnectTimeout, ReadTimeout, Timeout, RequestException, ConnectionError
 from . import repo
 from .minikube import MinikubeMirror
 from .aliyunlinux import AliyunLinuxMirror
@@ -84,25 +85,37 @@ def crawl_kernels(distro, version, arch, images):
 
     for distname, dist in DISTROS.items():
         if distname == distro or distro == "*":
-            # If the distro requires an image (Redhat only so far), we need to amalgamate
-            # the kernel versions from the supplied images before choosing the output.
-            if issubclass(dist, repo.ContainerDistro):
-                if images:
-                    kv = {}
-                    for image in images:
-                        d = dist(image)
-                        if len(kv) == 0:
-                            kv = d.get_kernel_versions()
-                        else:
-                            kv.update(d.get_kernel_versions())
-                    # We should now have a list of all kernel versions for the supplied images
-                    res = kv
+            try:
+                # If the distro requires an image (Redhat only so far), we need to amalgamate
+                # the kernel versions from the supplied images before choosing the output.
+                if issubclass(dist, repo.ContainerDistro):
+                    if images:
+                        kv = {}
+                        for image in images:
+                            d = dist(image)
+                            if len(kv) == 0:
+                                kv = d.get_kernel_versions()
+                            else:
+                                kv.update(d.get_kernel_versions())
+                        # We should now have a list of all kernel versions for the supplied images
+                        res = kv
+                    else:
+                        d = None
                 else:
-                    d = None
-            else:
-                d = dist(arch)
-                res = d.get_package_tree(version)
+                    d = dist(arch)
+                    res = d.get_package_tree(version)
+
+                if d and res:
+                    ret[distname] = to_driverkit_config(d, res)
+
+            except (ConnectTimeout, ReadTimeout, Timeout):
+                print(f"[ERROR] Timeout while fetching data for distro '{distname}'")
+            except ConnectionError:
+                print(f"[ERROR] Network unreachable or host down for distro '{distname}'")
+            except RequestException as e:
+                print(f"[ERROR] Request failed for distro '{distname}': {e}")
+            except Exception as e:
+                # Catch-all for unexpected issues
+                print(f"[ERROR] Unexpected error in distro '{distname}': {e}")
 
-            if d and res:
-                ret[distname] = to_driverkit_config(d, res)
     return ret
diff --git a/kernel_crawler/deb.py b/kernel_crawler/deb.py
index 890420b..86a56e2 100644
--- a/kernel_crawler/deb.py
+++ b/kernel_crawler/deb.py
@@ -276,7 +276,7 @@ def scan_repo(self, dist):
 
     def list_repos(self):
         dists_url = self.base_url + 'dists/'
-        dists = requests.get(dists_url)
+        dists = requests.get(dists_url, timeout = 15)
         dists.raise_for_status()
         dists = dists.content
         doc = html.fromstring(dists, dists_url)
diff --git a/kernel_crawler/flatcar.py b/kernel_crawler/flatcar.py
index 8ec17e3..1c3fc65 100644
--- a/kernel_crawler/flatcar.py
+++ b/kernel_crawler/flatcar.py
@@ -47,7 +47,7 @@ def __init__(self, arch):
 
     def scan_repo(self, base_url):
         try:
-            dists = requests.get(base_url)
+            dists = requests.get(base_url, timeout = 15)
             dists.raise_for_status()
         except requests.exceptions.RequestException:
             return {}
diff --git a/kernel_crawler/main.py b/kernel_crawler/main.py
index 507f106..08e8f39 100644
--- a/kernel_crawler/main.py
+++ b/kernel_crawler/main.py
@@ -54,14 +54,20 @@ def handle_parse_result(self, ctx, opts, args):
         return super(DistroImageValidation, self).handle_parse_result(ctx, opts, args)
 
 @click.command()
-@click.option('--distro', type=click.Choice(sorted(list(DISTROS.keys())) + ['*'], case_sensitive=True))
+@click.option('--distro', type=click.Choice(sorted(list(DISTROS.keys())) + ['*'], case_sensitive=True), required=True)
 @click.option('--version', required=False, default='')
 @click.option('--arch', required=False, type=click.Choice(['x86_64', 'aarch64'], case_sensitive=True), default='x86_64')
 @click.option('--image', cls=DistroImageValidation, required_if_distro=["Redhat"], multiple=True)
-def crawl(distro, version='', arch='', image=''):
+@click.option('--output', type=click.Path(dir_okay=False, writable=True), help="Optional file path to write JSON output")
+def crawl(distro, version='', arch='', image='', output=None):
     res = crawl_kernels(distro, version, arch, image)
     json_object = json.dumps(res, indent=2, default=vars)
-    print(json_object)
+    if output:
+        with open(output, 'w', encoding='utf-8') as f:
+            f.write(json_object)
+        click.echo(f"[INFO] JSON output written to {output}")
+    else:
+        click.echo(json_object)
 
 cli.add_command(crawl, 'crawl')
 
diff --git a/kernel_crawler/rpm.py b/kernel_crawler/rpm.py
index 41ec202..555b779 100644
--- a/kernel_crawler/rpm.py
+++ b/kernel_crawler/rpm.py
@@ -133,7 +133,8 @@ def dist_exists(self, dist):
                 self.dist_url(dist),
                 headers={ # some URLs require a user-agent, otherwise they return HTTP 406 - this one is fabricated
                     'user-agent': 'dummy'
-                }
+                },
+                timeout = 15
             )
             r.raise_for_status()
         except requests.exceptions.RequestException:
@@ -145,7 +146,8 @@ def list_repos(self):
             self.base_url,
             headers={ # some URLs require a user-agent, otherwise they return HTTP 406 - this one is fabricated
                 'user-agent': 'dummy'
-            }
+            },
+            timeout = 15
         )
         dists.raise_for_status()
         dists = dists.content
@@ -185,7 +187,8 @@ def list_repos(self):
             self.base_url,
             headers={ # some URLs require a user-agent, otherwise they return HTTP 406 - this one is fabricated
                 'user-agent': 'dummy'
-            }
+            },
+            timeout = 15
         )
         dists.raise_for_status()
         dists = dists.content
diff --git a/kernel_crawler/utils/download.py b/kernel_crawler/utils/download.py
index 2d05da7..fc3f180 100644
--- a/kernel_crawler/utils/download.py
+++ b/kernel_crawler/utils/download.py
@@ -9,32 +9,53 @@
 except ImportError:
     from backports import lzma
 
+from requests.exceptions import (
+    ConnectTimeout,
+    ReadTimeout,
+    Timeout,
+    ConnectionError,
+    RequestException,
+)
+
+
 def get_url(url):
-    resp = requests.get(
-        url,
-        headers={ # some URLs require a user-agent, otherwise they return HTTP 406 - this one is fabricated
-            'user-agent': 'dummy'
-        }
-    )
-
-    # if 404, silently fail
-    if resp.status_code == 404:
-        return None
-    else: # if any other error, raise the error - might be a bug in crawler
-        resp.raise_for_status()
-
-    # if no error, return the (eventually decompressed) contents
-    if url.endswith('.gz'):
-        return zlib.decompress(resp.content, 47)
-    elif url.endswith('.xz'):
-        return lzma.decompress(resp.content)
-    elif url.endswith('.bz2'):
-        return bz2.decompress(resp.content)
-    elif url.endswith('.zst'):
-        with zstandard.ZstdDecompressor().stream_reader(io.BytesIO(resp.content)) as rr:
-            return rr.read()
-    else:
-        return resp.content
+    try:
+        resp = requests.get(
+            url,
+            headers={ # some URLs require a user-agent, otherwise they return HTTP 406 - this one is fabricated
+                'user-agent': 'dummy'
+            },
+            timeout=15,
+        )
+
+        # if 404, silently fail
+        if resp.status_code == 404:
+            return None
+        else: # if any other error, raise the error - might be a bug in crawler
+            resp.raise_for_status()
+
+        # if no error, return the (eventually decompressed) contents
+        if url.endswith('.gz'):
+            return zlib.decompress(resp.content, 47)
+        elif url.endswith('.xz'):
+            return lzma.decompress(resp.content)
+        elif url.endswith('.bz2'):
+            return bz2.decompress(resp.content)
+        elif url.endswith('.zst'):
+            with zstandard.ZstdDecompressor().stream_reader(io.BytesIO(resp.content)) as rr:
+                return rr.read()
+        else:
+            return resp.content
+
+    except (ConnectTimeout, ReadTimeout, Timeout):
+        print(f"[ERROR] Timeout fetching {url}")
+    except ConnectionError:
+        print(f"[ERROR] Network unreachable or host down: {url}")
+    except RequestException as e:
+        print(f"[ERROR] Request failed for {url}: {e}")
+    except Exception as e:
+        print(f"[ERROR] Unexpected error fetching {url}: {e}")
+    return None
 
 
 def get_first_of(urls):
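
Note, not part of the patch: a minimal usage sketch of how the hardened helper is expected to behave after this change. It assumes the package is importable as kernel_crawler and uses a made-up mirror URL purely for illustration. With the patch applied, timeouts, connection failures, and other request errors inside get_url() are caught and reported, and the caller simply receives None, the same value already returned for a 404, so a single unreachable mirror no longer aborts the whole crawl.

    from kernel_crawler.utils.download import get_url

    # Hypothetical mirror URL, used only to illustrate the new failure behaviour.
    body = get_url("https://mirrors.example.org/repo/repodata/primary.xml.gz")
    if body is None:
        # 404s, timeouts, and connection/request errors all surface here as None now.
        print("mirror unavailable or missing, skipping")
    else:
        # .gz payloads are decompressed by get_url() before being returned.
        print(f"fetched {len(body)} bytes of repository metadata")

    # The new --output flag in main.py covers the CLI side of the same goal,
    # e.g.: kernel-crawler crawl --distro='*' --arch=x86_64 --output kernels.json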