Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 37 additions & 14 deletions install/helioviewer/hvpull/servers/__init__.py
Original file line number Diff line number Diff line change
@@ -1,30 +1,57 @@
"""Classes for working with known data servers"""
import os
import re
import datetime


def get_datetime_from_file(filename):
    """Extract the timestamp embedded in a data file's name.

    Only the final path component of ``filename`` is inspected. The
    following layouts are tried, in this order:

    * ``%Y_%m_%d__%H_%M_%S`` (e.g. ``2011_11_17__08_13_08``)
    * ``%Y%m%dT%H%M%S``      (e.g. ``20220301T120000``)
    * ``%Y%m%d%H%M%S``       (e.g. ``20200102030405``)

    A candidate that has the right shape but is not a real calendar
    date/time is skipped, so later candidates and later layouts still get
    a chance to match.

    Args:
        filename: File name, path or URL whose basename contains a timestamp.

    Returns:
        A naive ``datetime.datetime`` built from the first parsable
        timestamp.

    Raises:
        ValueError: If no layout yields a valid datetime.
    """
    url_filename = os.path.basename(filename)

    # (regex that locates a candidate, strptime format that parses it),
    # ordered from most to least specific.
    layouts = (
        (r'\d{4}_\d{2}_\d{2}__\d{2}_\d{2}_\d{2}', '%Y_%m_%d__%H_%M_%S'),
        (r'\d{8}T\d{6}', '%Y%m%dT%H%M%S'),
        (r'\d{14}', '%Y%m%d%H%M%S'),
    )

    for pattern, fmt in layouts:
        for match in re.finditer(pattern, url_filename):
            try:
                return datetime.datetime.strptime(match.group(0), fmt)
            except ValueError:
                # Right shape, impossible date (e.g. month 99): keep looking
                # instead of aborting the whole lookup.
                continue

    raise ValueError(f"No valid datetime format found in filename: {filename}")


class DataServer:
"""Class for interacting with data servers."""
def __init__(self, uri, name, pause=3):
    """Store the server location, its name and the pause interval.

    Args:
        uri: Address of the data server; kept unchanged on ``self.uri``.
        name: Name of the server; kept unchanged on ``self.name``.
        pause: Number of minutes; stored on ``self.pause`` as a
            ``datetime.timedelta``.
    """
    self.uri = uri
    self.name = name
    self.pause = datetime.timedelta(minutes=pause)

    # Example: 2011_11_17__08_13_08_13__SDO_AIA_AIA_304.jp2
    # Raw strings keep ``\d`` and ``\.`` as regex escapes rather than
    # (invalid) Python string escapes. The hour/min/sec fragment appears
    # once only: a pattern may not define the same group name twice.
    self.filename_regex = (
        r"^(?P<year>\d{4})_(?P<month>\d{2})_(?P<day>\d{2})__"
        r"(?P<hour>\d{2})_(?P<min>\d{2})_(?P<sec>\d{2})_"
        r"(?P<microsec>\d{2,3})__"
        r"(?P<obs>[a-zA-Z0-9]{3})_(?P<inst>[a-zA-Z0-9]{3})_"
        r"(?P<det>[a-zA-Z0-9]{3})_(?P<meas>[a-zA-Z0-9]{2,11})\.jp2$"
    )

def compute_directories(self, start_date, end_date):
    """Return the directories that may contain new files.

    This implementation ignores both dates and always hands back a new,
    empty list.
    """
    candidate_dirs = list()
    return candidate_dirs

def get_starttime(self):
    """Return the default start time to use when retrieving data.

    Returns:
        A naive ``datetime.datetime`` holding the current UTC time minus
        six hours.
    """
    # ``datetime.utcnow()`` is deprecated. Take an aware "now" in UTC and
    # drop the tzinfo so callers keep receiving the same naive UTC value.
    utc_now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
    return utc_now - datetime.timedelta(hours=6)

def get_dates(self, starttime, endtime):
"""Get a complete list of dates between the start and the end time"""
fmt = "%Y/%m/%d"
Expand All @@ -34,13 +61,13 @@ def get_dates(self, starttime, endtime):
while date < endtime.date():
date = date + datetime.timedelta(days=1)
dates.append(date.strftime(fmt))

# Ensure the dates are most recent first
dates.sort()
dates.reverse()

return dates

def get_file_regex(self):
"""Returns a regex which described the expected format of filenames on
the server"""
Expand All @@ -49,15 +76,13 @@ def get_file_regex(self):
def get_measurements(self, nicknames, dates):
    """Return every URI down to the measurement level.

    This implementation does not use ``nicknames`` or ``dates`` and
    returns ``None``.
    """
    measurement_uris = None
    return measurement_uris

def get_uri(self):
    """Give back the URI stored on this server object (``self.uri``)."""
    server_uri = self.uri
    return server_uri

def get_datetime_from_file(self, filename):
    """Extract the timestamp embedded in ``filename``.

    Delegates to the module-level ``get_datetime_from_file`` function so
    that every timestamp layout it recognises is supported here too,
    rather than only a fixed-width ``%Y_%m_%d__%H_%M_%S`` prefix.

    Args:
        filename: File name, path or URL of a data file.

    Returns:
        The ``datetime.datetime`` parsed from the file name.

    Raises:
        ValueError: If no timestamp can be parsed from the name.
    """
    # Inside a method body this name resolves to the module-level function,
    # not to this method, so there is no recursion.
    return get_datetime_from_file(filename)


class DataServerPauseDelayDefinesDefaultStartTime:
Expand Down Expand Up @@ -115,7 +140,5 @@ def get_uri(self):
return self.uri

def get_datetime_from_file(self, filename):
    """Extract the timestamp embedded in ``filename``.

    Delegates to the module-level ``get_datetime_from_file`` function so
    that every timestamp layout it recognises is supported here too,
    rather than only a fixed-width ``%Y_%m_%d__%H_%M_%S`` prefix.

    Args:
        filename: File name, path or URL of a data file.

    Returns:
        The ``datetime.datetime`` parsed from the file name.

    Raises:
        ValueError: If no timestamp can be parsed from the name.
    """
    # Inside a method body this name resolves to the module-level function,
    # not to this method, so there is no recursion.
    return get_datetime_from_file(filename)

41 changes: 24 additions & 17 deletions install/helioviewer/hvpull/servers/hv.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,28 +16,35 @@ def compute_directories(self, start_date, end_date):
# Start with date directories
for date in self.get_dates(start_date, end_date):
date_url = os.path.join(self.uri, date)
# Recursively enumerate subdirectories starting from date URL
dirs.extend(self._enumerate_subdirectories(date_url))

# Query the URL to find subdirectories
try:
response = requests.get(date_url)
response.raise_for_status()
return dirs

# Extract subdirectory links from HTML
subdirs = self._parse_directory_links(response.content.decode('utf-8'))
def _enumerate_subdirectories(self, url, timeout=60):
    """Recursively collect the leaf directories found below ``url``.

    The URL is fetched and its body is passed to
    ``self._parse_directory_links``. Every link that helper returns is
    treated as a subdirectory name and explored in turn.

    Args:
        url: Directory URL to start from.
        timeout: Seconds to wait for the server before giving up on a
            request. Without a timeout a stalled server would block the
            enumeration indefinitely.

    Returns:
        A list of URLs. A URL is reported as a leaf (returned as-is) when
        it has no subdirectory links or when it cannot be queried.
    """
    # Only the network calls can raise RequestException, so only they are
    # guarded; a timeout is a RequestException and is handled the same way.
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
    except requests.RequestException:
        # If we can't query the URL, return it as a leaf directory
        return [url]

    # Extract subdirectory links from HTML
    subdirs = self._parse_directory_links(response.content.decode('utf-8'))
    if not subdirs:
        # No subdirectories found, this is a leaf directory
        return [url]

    # Recursively enumerate each subdirectory
    all_dirs = []
    for subdir in subdirs:
        subdir_url = f"{url}/{subdir}"
        all_dirs.extend(self._enumerate_subdirectories(subdir_url, timeout=timeout))
    return all_dirs

def _parse_directory_links(self, html):
"""Parse HTML content and extract directory links"""
Expand Down
Loading