Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 5 additions & 23 deletions kernel_crawler/rpm.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,11 @@
import sqlite3
import tempfile
import re
import zstandard as zstd
import io

from . import repo
from kernel_crawler.utils.download import get_url

try:
import lzma
except ImportError:
from backports import lzma


class RpmRepository(repo.Repository):
def __init__(self, base_url):
self.base_url = base_url
Expand Down Expand Up @@ -258,25 +251,17 @@ def build_kernel_devel_noarch_url(self, kernel_release):
'''
return f'{self.base_url}noarch/kernel-devel-{kernel_release}.rpm'.replace(self.arch, 'noarch')

def open_repo(self, repo_path, isZstd):
def open_repo(self, repo_path):
package_match = f'{self.arch}/{self._kernel_devel_pattern}'
# regex searching through a file is more memory efficient
# than parsing the xml into an object structure with lxml etree
open_mode = 'r'
if isZstd:
open_mode = 'rb'
with open(repo_path, mode=open_mode) as f:
if isZstd:
dctx = zstd.ZstdDecompressor(max_window_size=2147483648)
stream_reader = dctx.stream_reader(f)
text = io.TextIOWrapper(stream_reader, encoding='utf-8').read()
else:
text = str(f.read())

with open(repo_path, mode='r') as f:
text = str(f.read())
search = re.search(f'.*href="({package_match}.*rpm)', text)
if search:
return search.group(1)
return None
return None

def get_package_tree(self, filter=''):
'''
Expand All @@ -300,10 +285,7 @@ def get_package_tree(self, filter=''):
with tempfile.NamedTemporaryFile() as tf:
tf.write(repodb)
tf.flush()
try:
kernel_default_devel_pkg_url = self.open_repo(tf.name, False)
except UnicodeDecodeError:
kernel_default_devel_pkg_url = self.open_repo(tf.name, True)
kernel_default_devel_pkg_url = self.open_repo(tf.name)
tf.close() # delete the tempfile to free up memory

# check to ensure a kernel_devel_pkg was found
Expand Down
7 changes: 6 additions & 1 deletion kernel_crawler/utils/download.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import bz2
import zlib
import zstandard
import requests
import io

try:
import lzma
Expand All @@ -21,13 +23,16 @@ def get_url(url):
else: # if any other error, raise the error - might be a bug in crawler
resp.raise_for_status()

# if no error, return the contents
# if no error, return the (possibly decompressed) contents
if url.endswith('.gz'):
return zlib.decompress(resp.content, 47)
elif url.endswith('.xz'):
return lzma.decompress(resp.content)
elif url.endswith('.bz2'):
return bz2.decompress(resp.content)
elif url.endswith('.zst'):
with zstandard.ZstdDecompressor().stream_reader(io.BytesIO(resp.content)) as rr:
return rr.read()
else:
return resp.content

Expand Down
Loading