File size: 8,579 Bytes
439e1dd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
"""
Date Filtering Module
Handles date parsing and filtering for articles and documents
"""

import logging
from datetime import datetime
from typing import Optional
import re
from dateutil import parser as date_parser

# Configure logging
logger = logging.getLogger(__name__)


def parse_article_date(date_str: Optional[str]) -> Optional[datetime]:
    """
    Parse an article date string into a datetime object.

    Common scraped prefixes ("Posted on", "Published:", ...) are removed
    first. The remaining text is handed to dateutil's fuzzy parser; if that
    fails, a regex fallback looks for a purely numeric date embedded in the
    text (YYYY-MM-DD, YYYY/MM/DD, DD-MM-YYYY or DD/MM/YYYY).

    Args:
        date_str: Date string to parse. None, empty and whitespace-only
            values are treated as "no date".

    Returns:
        datetime object if parsing is successful, None otherwise. Fields
        missing from the string are taken from today's date at midnight
        (the ``default`` given to dateutil). NOTE: per the dateutil
        documentation the result can be timezone-aware when the text
        carries a UTC offset, so callers comparing against naive datetimes
        must normalise first.
    """
    if not date_str or not date_str.strip():
        return None

    date_str = date_str.strip()

    # Try to clean up common prefixes
    date_str = re.sub(r'^(Posted on|Published on|Date:|Posted:|Published:)\s*', '', date_str, flags=re.IGNORECASE)
    date_str = date_str.strip()

    # Numeric fallback patterns; the flag tells whether the year comes first.
    fallback_patterns = (
        (r'(\d{4}-\d{2}-\d{2})', True),   # YYYY-MM-DD
        (r'(\d{4}/\d{2}/\d{2})', True),   # YYYY/MM/DD
        (r'(\d{2}-\d{2}-\d{4})', False),  # DD-MM-YYYY
        (r'(\d{2}/\d{2}/\d{4})', False),  # DD/MM/YYYY
    )

    try:
        # Strategy 1: Use dateutil parser (handles most formats)
        try:
            midnight_today = datetime.now().replace(hour=0, minute=0, second=0, microsecond=0)
            parsed_date = date_parser.parse(date_str, fuzzy=True, default=midnight_today)
            logger.debug(f"βœ… Successfully parsed date '{date_str}' to {parsed_date}")
            return parsed_date
        except (ValueError, TypeError, OverflowError) as e:
            # OverflowError is raised by dateutil for oversized numeric
            # tokens; treat it like any other parse failure so the regex
            # fallback below still gets a chance.
            logger.debug(f"⚠️ dateutil parser failed for '{date_str}': {str(e)}")

        # Strategy 2: Look for a plain numeric date anywhere in the text
        for pattern, year_first in fallback_patterns:
            match = re.search(pattern, date_str)
            if not match:
                continue

            date_part = match.group(1)
            try:
                first, second, third = (int(piece) for piece in re.split(r'[-/]', date_part))
                if year_first:
                    parsed_date = datetime(first, second, third)
                    logger.debug(f"βœ… Successfully parsed date '{date_str}' to {parsed_date} using ISO pattern")
                else:
                    parsed_date = datetime(third, second, first)
                    logger.debug(f"βœ… Successfully parsed date '{date_str}' to {parsed_date} using DD-MM-YYYY pattern")
                return parsed_date
            except (ValueError, IndexError) as e:
                # e.g. month 13 or day 32: try the next pattern instead
                logger.debug(f"⚠️ Failed to parse date part '{date_part}': {str(e)}")
                continue

        logger.warning(f"⚠️ Could not parse date string: '{date_str}'")
        return None

    except Exception as e:
        # Deliberate best-effort boundary: a bad date must never break the
        # caller's scraping/filtering loop.
        logger.error(f"❌ Unexpected error parsing date '{date_str}': {str(e)}")
        return None


def standardize_date(date_str: str, default_to_current: bool = False) -> Optional[str]:
    """
    Convert a free-form date string into the canonical YYYY-MM-DD form.

    The heavy lifting is delegated to parse_article_date; this wrapper only
    formats the result and decides what to do when no date can be obtained.

    Args:
        date_str: Date text in any format parse_article_date understands
            (e.g. "January 15, 2024" or "Posted on 2024-01-15").
        default_to_current: When True, today's date is returned instead of
            None if the input is empty or cannot be parsed.

    Returns:
        The date formatted as YYYY-MM-DD, or None when the input is
        empty/unparseable and default_to_current is False.
    """
    output_format = "%Y-%m-%d"

    has_text = bool(date_str) and bool(date_str.strip())
    if not has_text:
        # Nothing to parse: fall back silently (no log entry for empty input)
        return datetime.now().strftime(output_format) if default_to_current else None

    result = parse_article_date(date_str)

    if result is not None:
        standardized = result.strftime(output_format)
        logger.debug(f"βœ… Standardized date '{date_str}' to '{standardized}'")
        return standardized

    # Parsing failed: either substitute today's date or give up
    if not default_to_current:
        logger.debug(f"⚠️ Could not standardize date '{date_str}'")
        return None

    logger.warning(f"⚠️ Could not parse date '{date_str}', using current date")
    return datetime.now().strftime(output_format)


def parse_date_input(date_input: Optional[str]) -> Optional[datetime]:
    """
    Parse date input from UI (expected to be in YYYY-MM-DD format)

    The strict YYYY-MM-DD format is tried first; anything else is handed to
    dateutil (non-fuzzy) as a fallback.

    Args:
        date_input: Date string from UI input (YYYY-MM-DD format). None,
            empty and whitespace-only values are treated as "no date".

    Returns:
        datetime object if parsing successful, None otherwise. No exception
        from the parsers is propagated for string input.
    """
    if not date_input or not date_input.strip():
        return None

    date_input = date_input.strip()

    try:
        # Try parsing as YYYY-MM-DD
        parsed_date = datetime.strptime(date_input, "%Y-%m-%d")
    except ValueError:
        # Not in the strict format: fall through to the dateutil fallback
        pass
    else:
        logger.debug(f"βœ… Successfully parsed date input '{date_input}' to {parsed_date}")
        return parsed_date

    try:
        # Try using dateutil as fallback
        parsed_date = date_parser.parse(date_input, fuzzy=False)
    except (ValueError, TypeError, OverflowError) as e:
        # dateutil documents OverflowError for oversized numeric input; it
        # is a parse failure like any other and must not reach the UI.
        logger.warning(f"⚠️ Could not parse date input '{date_input}': {str(e)}")
        return None

    logger.debug(f"βœ… Successfully parsed date input '{date_input}' to {parsed_date} using dateutil")
    return parsed_date


def is_date_in_range(article_date_str: Optional[str], start_date: Optional[datetime], end_date: Optional[datetime], include_missing: bool = True) -> bool:
    """
    Check if article date falls within the selected date range

    The comparison is made on calendar days: the article date and the start
    bound are truncated to midnight and the end bound is extended to the
    last microsecond of its day, so both bounds are inclusive. Any timezone
    information is dropped before comparing (the wall-clock date is used),
    because mixing timezone-aware article dates with naive range bounds
    would otherwise raise TypeError.

    Args:
        article_date_str: Article date as string
        start_date: Start date of range (inclusive), None if no start date
        end_date: End date of range (inclusive), None if no end date
        include_missing: If True, include articles with missing/invalid dates. If False, exclude them.

    Returns:
        True if article date is in range (or if no date range provided), False otherwise.
        For a missing/unparseable article date the value of include_missing is returned.
    """
    # If no date range provided, include all articles
    if start_date is None and end_date is None:
        return True

    # Try to parse article date
    article_date = parse_article_date(article_date_str)

    # Handle missing/invalid dates
    if article_date is None:
        logger.debug(f"⚠️ Could not parse article date '{article_date_str}', include_missing={include_missing}")
        return include_missing

    # Normalize to start of day; tzinfo is removed so aware and naive
    # values can be compared safely.
    article_normalized = article_date.replace(hour=0, minute=0, second=0, microsecond=0, tzinfo=None)

    if start_date is not None:
        start_normalized = start_date.replace(hour=0, minute=0, second=0, microsecond=0, tzinfo=None)
        if article_normalized < start_normalized:
            logger.debug(f"πŸ“… Article date {article_normalized} is before start date {start_normalized}")
            return False

    if end_date is not None:
        # Normalize to end of day so the whole end date is included
        end_normalized = end_date.replace(hour=23, minute=59, second=59, microsecond=999999, tzinfo=None)
        if article_normalized > end_normalized:
            logger.debug(f"πŸ“… Article date {article_normalized} is after end date {end_normalized}")
            return False

    logger.debug(f"βœ… Article date {article_date} is within range [{start_date}, {end_date}]")
    return True