""" Date Filtering Module Handles date parsing and filtering for articles and documents """ import logging from datetime import datetime from typing import Optional import re from dateutil import parser as date_parser # Configure logging logger = logging.getLogger(__name__) def parse_article_date(date_str: str) -> Optional[datetime]: """ Parse article date string into datetime object Handles various date formats commonly found in scraped articles Args: date_str: Date string to parse Returns: datetime object if parsing successful, None otherwise """ if not date_str or not date_str.strip(): return None date_str = date_str.strip() # Try to clean up common prefixes date_str = re.sub(r'^(Posted on|Published on|Date:|Posted:|Published:)\s*', '', date_str, flags=re.IGNORECASE) date_str = date_str.strip() # Try various parsing strategies try: # Strategy 1: Use dateutil parser (handles most formats) try: parsed_date = date_parser.parse(date_str, fuzzy=True, default=datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)) logger.debug(f"✅ Successfully parsed date '{date_str}' to {parsed_date}") return parsed_date except (ValueError, TypeError) as e: logger.debug(f"⚠️ dateutil parser failed for '{date_str}': {str(e)}") # Strategy 2: Try common ISO format patterns iso_patterns = [ r'(\d{4}-\d{2}-\d{2})', # YYYY-MM-DD r'(\d{4}/\d{2}/\d{2})', # YYYY/MM/DD r'(\d{2}-\d{2}-\d{4})', # DD-MM-YYYY r'(\d{2}/\d{2}/\d{4})', # DD/MM/YYYY ] for pattern in iso_patterns: match = re.search(pattern, date_str) if match: date_part = match.group(1) try: # Try parsing with different separators if '-' in date_part: parts = date_part.split('-') elif '/' in date_part: parts = date_part.split('/') else: continue if len(parts[0]) == 4: # YYYY-MM-DD or YYYY/MM/DD year, month, day = int(parts[0]), int(parts[1]), int(parts[2]) parsed_date = datetime(year, month, day) logger.debug(f"✅ Successfully parsed date '{date_str}' to {parsed_date} using ISO pattern") return parsed_date elif len(parts[2]) == 4: # DD-MM-YYYY or DD/MM/YYYY day, month, year = int(parts[0]), int(parts[1]), int(parts[2]) parsed_date = datetime(year, month, day) logger.debug(f"✅ Successfully parsed date '{date_str}' to {parsed_date} using DD-MM-YYYY pattern") return parsed_date except (ValueError, IndexError) as e: logger.debug(f"⚠️ Failed to parse date part '{date_part}': {str(e)}") continue logger.warning(f"⚠️ Could not parse date string: '{date_str}'") return None except Exception as e: logger.error(f"❌ Unexpected error parsing date '{date_str}': {str(e)}") return None def standardize_date(date_str: str, default_to_current: bool = False) -> Optional[str]: """ Standardize a date string to YYYY-MM-DD format for consistent storage and filtering. This function takes a date string in any format, parses it, and returns it in a standardized YYYY-MM-DD format that can be used with the date filter. Args: date_str: Date string in any format (e.g., "January 15, 2024", "15/01/2024", "Posted on 2024-01-15") default_to_current: If True, return current date when parsing fails. If False, return None. Returns: Standardized date string in YYYY-MM-DD format, or None if parsing fails (unless default_to_current=True) Examples: >>> standardize_date("January 15, 2024") '2024-01-15' >>> standardize_date("Posted on 2024-01-15") '2024-01-15' >>> standardize_date("15/01/2024") '2024-01-15' >>> standardize_date("invalid date") None >>> standardize_date("invalid date", default_to_current=True) '2025-01-07' # Current date """ if not date_str or not date_str.strip(): if default_to_current: return datetime.now().strftime("%Y-%m-%d") return None # Parse the date string parsed_date = parse_article_date(date_str) if parsed_date is None: if default_to_current: logger.warning(f"⚠️ Could not parse date '{date_str}', using current date") return datetime.now().strftime("%Y-%m-%d") logger.debug(f"⚠️ Could not standardize date '{date_str}'") return None # Return standardized format standardized = parsed_date.strftime("%Y-%m-%d") logger.debug(f"✅ Standardized date '{date_str}' to '{standardized}'") return standardized def parse_date_input(date_input: str) -> Optional[datetime]: """ Parse date input from UI (expected to be in YYYY-MM-DD format) Args: date_input: Date string from UI input (YYYY-MM-DD format) Returns: datetime object if parsing successful, None otherwise """ if not date_input or not date_input.strip(): return None date_input = date_input.strip() try: # Try parsing as YYYY-MM-DD parsed_date = datetime.strptime(date_input, "%Y-%m-%d") logger.debug(f"✅ Successfully parsed date input '{date_input}' to {parsed_date}") return parsed_date except ValueError: try: # Try using dateutil as fallback parsed_date = date_parser.parse(date_input, fuzzy=False) logger.debug(f"✅ Successfully parsed date input '{date_input}' to {parsed_date} using dateutil") return parsed_date except (ValueError, TypeError) as e: logger.warning(f"⚠️ Could not parse date input '{date_input}': {str(e)}") return None def is_date_in_range(article_date_str: str, start_date: Optional[datetime], end_date: Optional[datetime], include_missing: bool = True) -> bool: """ Check if article date falls within the selected date range Args: article_date_str: Article date as string start_date: Start date of range (inclusive), None if no start date end_date: End date of range (inclusive), None if no end date include_missing: If True, include articles with missing/invalid dates. If False, exclude them. Returns: True if article date is in range (or if no date range provided), False otherwise """ # If no date range provided, include all articles if start_date is None and end_date is None: return True # Try to parse article date article_date = parse_article_date(article_date_str) # Handle missing/invalid dates if article_date is None: logger.debug(f"⚠️ Could not parse article date '{article_date_str}', include_missing={include_missing}") return include_missing # Check if date is within range in_range = True if start_date is not None: # Normalize to start of day for comparison start_normalized = start_date.replace(hour=0, minute=0, second=0, microsecond=0) article_normalized = article_date.replace(hour=0, minute=0, second=0, microsecond=0) if article_normalized < start_normalized: in_range = False logger.debug(f"📅 Article date {article_normalized} is before start date {start_normalized}") if end_date is not None and in_range: # Normalize to end of day for comparison end_normalized = end_date.replace(hour=23, minute=59, second=59, microsecond=999999) article_normalized = article_date.replace(hour=0, minute=0, second=0, microsecond=0) if article_normalized > end_normalized: in_range = False logger.debug(f"📅 Article date {article_normalized} is after end date {end_normalized}") if in_range: logger.debug(f"✅ Article date {article_date} is within range [{start_date}, {end_date}]") return in_range