File size: 5,423 Bytes
6b9e3e8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
"""
Input validation utilities
"""

import logging
from typing import Dict, List, Optional, Any
import pandas as pd
from config.constants import MAX_FILE_SIZE, ALLOWED_EXTENSIONS

# Module-level logger, named after this module via __name__
logger = logging.getLogger(__name__)


def validate_file_upload(filename: str, filesize: int) -> Dict[str, Any]:
    """
    Check an uploaded file's extension and size against the configured limits

    Args:
        filename: Name of the uploaded file; the text after the last '.' is
            lower-cased and compared against ALLOWED_EXTENSIONS
        filesize: Size of the file in bytes

    Returns:
        {'valid': True} when nothing is wrong, otherwise
        {'valid': False, 'issues': [...]} with one message per problem
    """
    bytes_per_mb = 1024 * 1024
    problems = []

    # The extension is whatever follows the final dot; no dot means no extension
    _, dot, suffix = filename.rpartition('.')
    extension = suffix.lower() if dot else ''
    if extension not in ALLOWED_EXTENSIONS:
        allowed = ', '.join(ALLOWED_EXTENSIONS)
        problems.append(f"Invalid file type '{extension}'. Allowed: {allowed}")

    # Size limits: reject oversized uploads and zero-byte uploads
    if filesize > MAX_FILE_SIZE:
        limit_mb = MAX_FILE_SIZE / bytes_per_mb
        size_mb = filesize / bytes_per_mb
        problems.append(f"File too large ({size_mb:.1f}MB). Maximum: {limit_mb:.0f}MB")
    if filesize == 0:
        problems.append("File is empty")

    if not problems:
        return {'valid': True}
    return {'valid': False, 'issues': problems}


def validate_column_selection(
    data: pd.DataFrame,
    date_column: Optional[str],
    target_column: Optional[str]
) -> Dict[str, Any]:
    """
    Check that the chosen date and target columns are usable

    Each column must be selected (not None) and present in ``data.columns``,
    and the two selections must not name the same column.

    Args:
        data: DataFrame whose columns the selections are checked against
        date_column: Selected date column, or None if nothing was chosen
        target_column: Selected target column, or None if nothing was chosen

    Returns:
        {'valid': True} when the selection is acceptable, otherwise
        {'valid': False, 'issues': [...]} listing date problems first,
        then target problems, then the "same column" problem
    """
    problems = []

    # Both selections go through the same two checks, date first
    for role, chosen in (("date", date_column), ("target", target_column)):
        if chosen is None:
            problems.append(f"Please select a {role} column")
        elif chosen not in data.columns:
            problems.append(f"{role.capitalize()} column '{chosen}' not found in data")

    # Only compare when both selections are non-empty
    if date_column and target_column:
        if date_column == target_column:
            problems.append("Date and target columns must be different")

    return {'valid': False, 'issues': problems} if problems else {'valid': True}


def validate_forecast_parameters(
    horizon: int,
    confidence_levels: Optional[List[int]],
    data_length: int
) -> Dict[str, Any]:
    """
    Validate forecast parameters

    Hard problems are reported as issues and make the result invalid;
    soft problems are reported as warnings and leave the result valid.

    Args:
        horizon: Forecast horizon; must be positive, and a value above 365
            only produces a warning
        confidence_levels: List of confidence levels, each strictly between
            0 and 100. None is treated like an empty selection
        data_length: Length of the input data; fewer than 2x horizon points
            only produces a warning

    Returns:
        {'valid': False, 'issues': [...], 'warnings': [...]} if any issue
        was found, {'valid': True, 'warnings': [...]} if only warnings were
        found, and {'valid': True} otherwise
    """
    issues = []
    warnings = []

    # Validate horizon
    if horizon <= 0:
        issues.append("Forecast horizon must be positive")
    elif horizon > 365:
        warnings.append("Very long forecast horizon (>365 days) may be unreliable")

    # Check if sufficient data
    if data_length < horizon * 2:
        warnings.append(
            f"Limited historical data ({data_length} points) for {horizon}-period forecast. "
            "Recommend at least 2x horizon length."
        )

    # Validate confidence levels
    if not confidence_levels:
        issues.append("Please select at least one confidence level")

    # Iterate over an empty tuple when nothing was selected, so a None
    # selection is reported as an issue above instead of raising TypeError
    for cl in confidence_levels or ():
        if cl <= 0 or cl >= 100:
            issues.append(f"Invalid confidence level: {cl}%. Must be between 0 and 100.")

    if issues:
        return {'valid': False, 'issues': issues, 'warnings': warnings}

    if warnings:
        return {'valid': True, 'warnings': warnings}

    return {'valid': True}


def sanitize_input(text: str, max_length: int = 1000) -> str:
    """
    Clean up free-text input

    Drops characters with a code point below 32 (newline, carriage return
    and tab are kept), cuts the result to ``max_length`` characters, and
    finally strips leading and trailing whitespace.

    Args:
        text: Input text; None yields an empty string
        max_length: Maximum allowed length before whitespace is stripped

    Returns:
        Sanitized text
    """
    if text is None:
        return ""

    def _is_allowed(char: str) -> bool:
        # Printable range, or one of the whitespace controls we keep
        return ord(char) >= 32 or char in '\n\r\t'

    cleaned = ''.join(filter(_is_allowed, text))

    # Truncate first, then strip, so the limit applies to the raw cleaned text
    clipped = cleaned[:max_length] if len(cleaned) > max_length else cleaned
    return clipped.strip()


def validate_data_quality(data: pd.DataFrame, target_column: str) -> Dict[str, Any]:
    """
    Validate data quality for forecasting

    A missing column, an all-missing column and a non-numeric column each
    end validation immediately, because the remaining checks need numeric
    values. Otherwise infinite values are reported as an issue, while
    constant values, a standard deviation above 1e6 and more than 50% zeros
    are reported as warnings.

    Args:
        data: Input DataFrame
        target_column: Name of the target column

    Returns:
        {'valid': False, 'issues': [...]} for the three early exits,
        {'valid': False, 'issues': [...], 'warnings': [...]} if infinite
        values were found, {'valid': True, 'warnings': [...]} if only
        warnings were found, and {'valid': True} otherwise
    """
    issues = []
    warnings = []

    # Report a missing column as a validation issue instead of a KeyError
    if target_column not in data.columns:
        issues.append(f"Target column '{target_column}' not found in data")
        return {'valid': False, 'issues': issues}

    target = data[target_column]

    # Check for all NaN values (this also covers an empty DataFrame, which
    # keeps the zero-percentage division below from dividing by zero)
    if target.isna().all():
        issues.append("Target column contains only missing values")
        return {'valid': False, 'issues': issues}

    # np.isinf and the std() comparison below raise TypeError on
    # non-numeric data, so report that as an issue and stop here
    if not pd.api.types.is_numeric_dtype(target):
        issues.append(f"Target column '{target_column}' must be numeric")
        return {'valid': False, 'issues': issues}

    # Check for constant values
    if target.nunique() == 1:
        warnings.append("Target column has constant values - forecast may be trivial")

    # Check for infinite values
    inf_count = np.isinf(target).sum()
    if inf_count > 0:
        issues.append(f"Target column contains {inf_count} infinite values")

    # Check for very high variance
    if target.std() > 1e6:
        warnings.append("Target column has very high variance - consider scaling")

    # Check for zeros
    zero_pct = (target == 0).sum() / len(data) * 100
    if zero_pct > 50:
        warnings.append(f"{zero_pct:.1f}% of values are zero")

    if issues:
        return {'valid': False, 'issues': issues, 'warnings': warnings}

    if warnings:
        return {'valid': True, 'warnings': warnings}

    return {'valid': True}


import numpy as np