File size: 5,423 Bytes
6b9e3e8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 |
"""
Input validation utilities
"""
import logging
from typing import Dict, List, Optional, Any
import pandas as pd
from config.constants import MAX_FILE_SIZE, ALLOWED_EXTENSIONS
# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)
def validate_file_upload(filename: str, filesize: int) -> Dict[str, Any]:
    """
    Check an uploaded file's extension and size against the configured limits.

    Args:
        filename: Name of the uploaded file. The text after the last '.' is
            treated as the extension (empty string when there is no '.').
        filesize: Size of the file in bytes.

    Returns:
        ``{'valid': True}`` when nothing is wrong, otherwise
        ``{'valid': False, 'issues': [...]}`` with one message per problem.
    """
    problems = []

    # Extension: lower-cased text after the final dot, '' when no dot exists.
    _stem, dot, suffix = filename.rpartition('.')
    extension = suffix.lower() if dot else ''
    if extension not in ALLOWED_EXTENSIONS:
        allowed = ', '.join(ALLOWED_EXTENSIONS)
        problems.append(f"Invalid file type '{extension}'. Allowed: {allowed}")

    # Size: report both the actual and the maximum size in megabytes.
    bytes_per_mb = 1024 * 1024
    if filesize > MAX_FILE_SIZE:
        actual_mb = filesize / bytes_per_mb
        max_mb = MAX_FILE_SIZE / bytes_per_mb
        problems.append(f"File too large ({actual_mb:.1f}MB). Maximum: {max_mb:.0f}MB")

    if filesize == 0:
        problems.append("File is empty")

    return {'valid': False, 'issues': problems} if problems else {'valid': True}
def validate_column_selection(
    data: pd.DataFrame,
    date_column: Optional[str],
    target_column: Optional[str]
) -> Dict[str, Any]:
    """
    Check that a date column and a target column were chosen and exist.

    Args:
        data: DataFrame whose ``columns`` are searched for the selections.
        date_column: Selected date column name, or None if nothing was picked.
        target_column: Selected target column name, or None if nothing was picked.

    Returns:
        ``{'valid': True}`` when both selections are usable, otherwise
        ``{'valid': False, 'issues': [...]}`` listing every problem found.
    """
    problems = []

    # Same two checks for each role: was it selected, and does it exist?
    selections = (
        ("Date", "Please select a date column", date_column),
        ("Target", "Please select a target column", target_column),
    )
    for role, not_selected_msg, chosen in selections:
        if chosen is None:
            problems.append(not_selected_msg)
            continue
        if chosen not in data.columns:
            problems.append(f"{role} column '{chosen}' not found in data")

    # The same (non-empty) column cannot serve both roles.
    both_chosen = bool(date_column) and bool(target_column)
    if both_chosen and date_column == target_column:
        problems.append("Date and target columns must be different")

    return {'valid': False, 'issues': problems} if problems else {'valid': True}
def validate_forecast_parameters(
    horizon: int,
    confidence_levels: List[int],
    data_length: int
) -> Dict[str, Any]:
    """
    Validate forecast parameters.

    Args:
        horizon: Forecast horizon (number of periods); must be positive.
        confidence_levels: Confidence levels in percent, each strictly between
            0 and 100. An empty list or None is reported as an issue.
        data_length: Number of points in the input data.

    Returns:
        Validation result dictionary:
        - ``{'valid': False, 'issues': [...], 'warnings': [...]}`` when any
          blocking issue was found (``warnings`` may be empty);
        - ``{'valid': True, 'warnings': [...]}`` when only warnings were raised;
        - ``{'valid': True}`` otherwise.
    """
    issues = []
    warnings = []

    # Validate horizon
    if horizon <= 0:
        issues.append("Forecast horizon must be positive")
    elif horizon > 365:
        warnings.append("Very long forecast horizon (>365 days) may be unreliable")

    # Check if sufficient data
    if data_length < horizon * 2:
        warnings.append(
            f"Limited historical data ({data_length} points) for {horizon}-period forecast. "
            "Recommend at least 2x horizon length."
        )

    # Validate confidence levels
    if not confidence_levels:
        issues.append("Please select at least one confidence level")

    # `or ()` keeps a None selection from raising TypeError in the loop; the
    # missing selection has already been reported as an issue above.
    for cl in confidence_levels or ():
        if cl <= 0 or cl >= 100:
            issues.append(f"Invalid confidence level: {cl}%. Must be between 0 and 100.")

    if issues:
        return {'valid': False, 'issues': issues, 'warnings': warnings}
    if warnings:
        return {'valid': True, 'warnings': warnings}
    return {'valid': True}
def sanitize_input(text: str, max_length: int = 1000) -> str:
    """
    Clean up free-text input.

    Characters with a code point below 32 are dropped, except newline,
    carriage return and tab. The result is then cut to ``max_length``
    characters and finally stripped of surrounding whitespace.

    Args:
        text: Input text; None is treated as empty.
        max_length: Maximum number of characters kept before stripping.

    Returns:
        The sanitized text ("" when ``text`` is None).
    """
    if text is None:
        return ""

    keep_anyway = '\n\r\t'
    # Drop a character only when it is below 32 AND not explicitly allowed.
    cleaned = ''.join(
        ch for ch in text
        if not (ord(ch) < 32 and ch not in keep_anyway)
    )

    # Truncate first, strip afterwards, so trailing whitespace left by the
    # cut is removed as well.
    clipped = cleaned if len(cleaned) <= max_length else cleaned[:max_length]
    return clipped.strip()
def validate_data_quality(data: pd.DataFrame, target_column: str) -> Dict[str, Any]:
    """
    Run basic quality checks on the target column before forecasting.

    Args:
        data: Input DataFrame.
        target_column: Name of the column in ``data`` to inspect.

    Returns:
        Quality validation result:
        - ``{'valid': False, 'issues': [...]}`` when every value is missing
          (no further checks are run);
        - ``{'valid': False, 'issues': [...], 'warnings': [...]}`` when
          infinite values are present;
        - ``{'valid': True, 'warnings': [...]}`` when only warnings apply;
        - ``{'valid': True}`` otherwise.
    """
    # NOTE: `np` comes from the module-level `import numpy as np`.
    series = data[target_column]

    # Nothing else is meaningful if every value is missing: bail out early.
    if series.isna().all():
        return {'valid': False, 'issues': ["Target column contains only missing values"]}

    problems = []
    cautions = []

    # A single distinct value means there is nothing to forecast.
    if series.nunique() == 1:
        cautions.append("Target column has constant values - forecast may be trivial")

    # Infinite values are the only blocking problem past this point.
    n_infinite = np.isinf(series).sum()
    if n_infinite > 0:
        problems.append(f"Target column contains {n_infinite} infinite values")

    # Very large spread (standard deviation above 1e6).
    if series.std() > 1e6:
        cautions.append("Target column has very high variance - consider scaling")

    # Share of exact zeros, as a percentage of all rows.
    zero_share = (series == 0).sum() / len(data) * 100
    if zero_share > 50:
        cautions.append(f"{zero_share:.1f}% of values are zero")

    if problems:
        return {'valid': False, 'issues': problems, 'warnings': cautions}
    return {'valid': True, 'warnings': cautions} if cautions else {'valid': True}
import numpy as np
|