|
|
""" |
|
|
Document Scraper - Handles PDF and document processing |
|
|
""" |
|
|
|
|
|
import asyncio |
|
|
import json |
|
|
import logging |
|
|
import os |
|
|
import hashlib |
|
|
import tempfile |
|
|
import requests |
|
|
import urllib3 |
|
|
from datetime import datetime |
|
|
from typing import List, Dict, Any, Union
|
|
from urllib.parse import urlparse, urlunparse, unquote |
|
|
|
|
|
|
|
|
from scraper_common import ( |
|
|
WEBSITE_CONFIG, MAX_PDF_LIMIT, MAX_ARTICLE_LIMIT, MAX_PAGE_LIMIT, |
|
|
ensure_archive_directory, convert_to_absolute_url, |
|
|
set_scraping_cancelled, scraping_cancelled, force_close_browser, |
|
|
reset_global_pdf_count, increment_global_pdf_count, get_global_pdf_count, is_pdf_limit_reached, |
|
|
get_pdf_websites |
|
|
) |
|
|
|
|
|
|
|
|
from date_filter import is_date_in_range, parse_date_input, standardize_date |
|
|
|
|
|
|
|
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning) |
|
|
|
|
|
|
|
|
logging.basicConfig( |
|
|
level=logging.INFO, |
|
|
format='%(asctime)s - %(name)s - %(levelname)s - [%(filename)s:%(lineno)d] - %(message)s' |
|
|
) |
|
|
logger = logging.getLogger(__name__) |
|
|
|
|
|
|
|
|
def construct_navigation_url(base_url: str, nav_addition: str) -> str: |
|
|
""" |
|
|
Construct navigation URL by properly handling trailing slashes and query parameters |
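
    Examples (illustrative; URLs are hypothetical):
        >>> construct_navigation_url("https://example.org/reports/", "?page=2")
        'https://example.org/reports?page=2'
        >>> construct_navigation_url("https://example.org/reports", "page/3")
        'https://example.org/reports/page/3'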
|
|
""" |
|
|
|
|
|
    base_url = base_url.rstrip('/')
|
|
|
|
|
|
|
|
if nav_addition.startswith('/'): |
|
|
|
|
|
return base_url + nav_addition |
|
|
elif nav_addition.startswith('?'): |
|
|
|
|
|
return base_url + nav_addition |
|
|
else: |
|
|
|
|
|
return base_url + '/' + nav_addition |
|
|
|
|
|
|
|
|
mopnd_article_dates = {} |
|
|
mopnd_article_titles = {} |
|
|
|
|
|
def clear_mopnd_cache(): |
|
|
"""Clear MOPND article cache when starting a new scraping session""" |
|
|
global mopnd_article_dates, mopnd_article_titles |
|
|
mopnd_article_dates.clear() |
|
|
mopnd_article_titles.clear() |
|
|
logger.info("π§Ή Cleared MOPND article cache") |
|
|
|
|
|
def get_pdf_hash(pdf_url: str) -> str: |
|
|
"""Generate a hash for the PDF URL to use as cache key""" |
|
|
return hashlib.md5(pdf_url.encode()).hexdigest() |
|
|
|
|
|
def is_pdf_archived(pdf_url: str, source: str) -> bool: |
|
|
"""Check if PDF is already archived""" |
|
|
ensure_archive_directory() |
|
|
hash_key = get_pdf_hash(pdf_url) |
|
|
archive_dir = f"archive/{source}" |
|
|
date_folder = datetime.now().strftime("%Y-%m-%d") |
|
|
archive_path = f"{archive_dir}/{date_folder}" |
|
|
|
|
|
if os.path.exists(archive_path): |
|
|
for file in os.listdir(archive_path): |
|
|
if file.startswith(hash_key): |
|
|
return True |
|
|
return False |
|
|
|
|
|
def get_archived_pdf_path(pdf_url: str, source: str) -> str: |
|
|
"""Get the archived PDF file path""" |
|
|
ensure_archive_directory() |
|
|
hash_key = get_pdf_hash(pdf_url) |
|
|
archive_dir = f"archive/{source}" |
|
|
date_folder = datetime.now().strftime("%Y-%m-%d") |
|
|
archive_path = f"{archive_dir}/{date_folder}" |
|
|
|
|
|
if os.path.exists(archive_path): |
|
|
for file in os.listdir(archive_path): |
|
|
if file.startswith(hash_key): |
|
|
return os.path.join(archive_path, file) |
|
|
return None |
|
|
|
|
|
def archive_pdf(pdf_url: str, content: bytes, source: str) -> str: |
|
|
"""Archive PDF content and return the local file path""" |
|
|
logger.info(f"πΎ Starting PDF archiving process...") |
|
|
ensure_archive_directory() |
|
|
|
|
|
|
|
|
archive_dir = f"archive/{source}" |
|
|
date_folder = datetime.now().strftime("%Y-%m-%d") |
|
|
archive_path = f"{archive_dir}/{date_folder}" |
|
|
|
|
|
|
|
|
os.makedirs(archive_path, exist_ok=True) |
|
|
|
|
|
|
|
|
hash_key = get_pdf_hash(pdf_url) |
|
|
filename = f"{hash_key}.pdf" |
|
|
file_path = os.path.join(archive_path, filename) |
|
|
|
|
|
|
|
|
with open(file_path, 'wb') as f: |
|
|
f.write(content) |
|
|
|
|
|
logger.info(f"π PDF archived to: {file_path}") |
|
|
|
|
|
|
|
|
update_archive_index(pdf_url, file_path, source) |
|
|
|
|
|
return file_path |
|
|
|
|
|
def archive_file(file_url: str, content: bytes, source: str, file_extension: str = "csv") -> str: |
|
|
"""Archive file content (CSV, etc.) and return the local file path""" |
|
|
logger.info(f"πΎ Starting file archiving process for {file_extension.upper()}...") |
|
|
ensure_archive_directory() |
|
|
|
|
|
|
|
|
archive_dir = f"archive/{source}" |
|
|
date_folder = datetime.now().strftime("%Y-%m-%d") |
|
|
archive_path = f"{archive_dir}/{date_folder}" |
|
|
|
|
|
|
|
|
os.makedirs(archive_path, exist_ok=True) |
|
|
|
|
|
|
|
|
hash_key = get_pdf_hash(file_url) |
|
|
filename = f"{hash_key}.{file_extension}" |
|
|
file_path = os.path.join(archive_path, filename) |
|
|
|
|
|
|
|
|
with open(file_path, 'wb') as f: |
|
|
f.write(content) |
|
|
|
|
|
logger.info(f"π File archived to: {file_path}") |
|
|
|
|
|
|
|
|
update_archive_index(file_url, file_path, source) |
|
|
|
|
|
return file_path |
|
|
|
|
|
def update_archive_index(pdf_url: str, local_path: str, source: str): |
|
|
"""Update the archive index with PDF information""" |
|
|
ensure_archive_directory() |
|
|
index_file = f"archive/{source}/index.json" |
|
|
|
|
|
|
|
|
if os.path.exists(index_file): |
|
|
try: |
|
|
with open(index_file, 'r') as f: |
|
|
index = json.load(f) |
|
|
        except (json.JSONDecodeError, OSError):
|
|
index = {} |
|
|
else: |
|
|
index = {} |
|
|
|
|
|
|
|
|
hash_key = get_pdf_hash(pdf_url) |
|
|
index[hash_key] = { |
|
|
"url": pdf_url, |
|
|
"local_path": local_path, |
|
|
"source": source, |
|
|
"archived_date": datetime.now().isoformat() |
|
|
} |
|
|
|
|
|
|
|
|
with open(index_file, 'w') as f: |
|
|
json.dump(index, f, indent=2) |
|
|
|
|
|
def download_and_save_pdf(pdf_url: str, source: str = "unknown") -> dict: |
|
|
""" |
|
|
Download PDF and save it to archive, return metadata |
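
    Illustrative usage (URL and source are hypothetical; performs network I/O):
        result = download_and_save_pdf("https://example.org/report.pdf", source="reliefweb")
        # result -> {"success": True,
        #            "path": "archive/reliefweb/<date>/<md5>.pdf",
        #            "size": 123456,
        #            "message": "PDF downloaded and archived successfully"}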
|
|
""" |
|
|
try: |
|
|
logger.info(f"β¬οΈ Downloading PDF: {pdf_url}") |
|
|
logger.info(f"π Source: {source}") |
|
|
|
|
|
|
|
|
if is_pdf_archived(pdf_url, source): |
|
|
logger.info(f"β
PDF already archived: {pdf_url}") |
|
|
cached_path = get_archived_pdf_path(pdf_url, source) |
|
|
return { |
|
|
"success": True, |
|
|
"path": cached_path, |
|
|
"size": os.path.getsize(cached_path), |
|
|
"message": "PDF already archived" |
|
|
} |
|
|
|
|
|
|
|
|
parsed_url = urlparse(pdf_url) |
|
|
base_domain = f"{parsed_url.scheme}://{parsed_url.netloc}" |
|
|
|
|
|
headers = { |
|
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", |
|
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", |
|
|
"Accept-Language": "en-US,en;q=0.9", |
|
|
"Connection": "keep-alive", |
|
|
"Referer": base_domain |
|
|
} |
|
|
|
|
|
logger.info(f"π Using base domain as referer: {base_domain}") |
|
|
|
|
|
|
|
|
try: |
|
|
session = requests.Session() |
|
|
|
|
|
session.verify = False |
|
|
|
|
|
|
|
|
session.get(base_domain, headers=headers, timeout=30, verify=False) |
|
|
logger.info(f"πͺ Visited domain homepage to gather cookies") |
|
|
|
|
|
|
|
|
response = session.get(pdf_url, headers=headers, timeout=30, verify=False) |
|
|
response.raise_for_status() |
|
|
logger.info(f"β
PDF downloaded successfully. Size: {len(response.content)} bytes") |
|
|
except Exception as e: |
|
|
logger.error(f"β Error downloading PDF: {str(e)}") |
|
|
raise |
|
|
|
|
|
|
|
|
archived_path = archive_pdf(pdf_url, response.content, source) |
|
|
logger.info(f"π PDF archived to: {archived_path}") |
|
|
|
|
|
return { |
|
|
"success": True, |
|
|
"path": archived_path, |
|
|
"size": len(response.content), |
|
|
"message": "PDF downloaded and archived successfully" |
|
|
} |
|
|
except Exception as e: |
|
|
|
|
|
logger.error(f"β PDF download failed for {pdf_url}: {str(e)}") |
|
|
return { |
|
|
"success": False, |
|
|
"path": None, |
|
|
"size": 0, |
|
|
"message": f"Error downloading PDF: {str(e)}" |
|
|
} |
|
|
|
|
|
def download_and_save_file(file_url: str, source: str = "unknown", file_type: str = "csv") -> dict: |
|
|
""" |
|
|
Download file (CSV, etc.) and save it to archive, return metadata |
|
|
""" |
|
|
try: |
|
|
logger.info(f"β¬οΈ Downloading {file_type.upper()}: {file_url}") |
|
|
logger.info(f"π Source: {source}") |
|
|
|
|
|
|
|
|
file_extension = file_type.lower() |
|
|
if file_extension not in ["csv", "xlsx", "xls", "png", "jpg", "jpeg", "gif", "webp"]: |
|
|
|
|
|
if file_url.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp')): |
|
|
file_extension = file_url.lower().split('.')[-1] |
|
|
else: |
|
|
file_extension = "csv" |
|
|
|
|
|
|
|
|
if is_pdf_archived(file_url, source): |
|
|
logger.info(f"β
File already archived: {file_url}") |
|
|
cached_path = get_archived_pdf_path(file_url, source) |
|
|
|
|
|
if cached_path and os.path.exists(cached_path): |
|
|
return { |
|
|
"success": True, |
|
|
"path": cached_path, |
|
|
"size": os.path.getsize(cached_path), |
|
|
"file_type": file_type, |
|
|
"message": "File already archived" |
|
|
} |
|
|
|
|
|
|
|
|
parsed_url = urlparse(file_url) |
|
|
base_domain = f"{parsed_url.scheme}://{parsed_url.netloc}" |
|
|
|
|
|
headers = { |
|
|
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36", |
|
|
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8", |
|
|
"Accept-Language": "en-US,en;q=0.9", |
|
|
"Connection": "keep-alive", |
|
|
"Referer": base_domain |
|
|
} |
|
|
|
|
|
logger.info(f"π Using base domain as referer: {base_domain}") |
|
|
|
|
|
|
|
|
try: |
|
|
session = requests.Session() |
|
|
|
|
|
session.verify = False |
|
|
|
|
|
|
|
|
session.get(base_domain, headers=headers, timeout=30, verify=False) |
|
|
logger.info(f"πͺ Visited domain homepage to gather cookies") |
|
|
|
|
|
|
|
|
response = session.get(file_url, headers=headers, timeout=30, verify=False) |
|
|
response.raise_for_status() |
|
|
logger.info(f"β
{file_type.upper()} downloaded successfully. Size: {len(response.content)} bytes") |
|
|
except Exception as e: |
|
|
logger.error(f"β Error downloading {file_type.upper()}: {str(e)}") |
|
|
raise |
|
|
|
|
|
|
|
|
archived_path = archive_file(file_url, response.content, source, file_extension) |
|
|
logger.info(f"π {file_type.upper()} archived to: {archived_path}") |
|
|
|
|
|
return { |
|
|
"success": True, |
|
|
"path": archived_path, |
|
|
"size": len(response.content), |
|
|
"file_type": file_type, |
|
|
"message": f"{file_type.upper()} downloaded and archived successfully" |
|
|
} |
|
|
except Exception as e: |
|
|
|
|
|
logger.error(f"β {file_type.upper()} download failed for {file_url}: {str(e)}") |
|
|
return { |
|
|
"success": False, |
|
|
"path": None, |
|
|
"size": 0, |
|
|
"file_type": file_type, |
|
|
"message": f"Error downloading {file_type.upper()}: {str(e)}" |
|
|
} |
|
|
|
|
|
def get_website_type_from_source(source: str) -> str: |
|
|
""" |
|
|
Map source name to website type for config lookup |
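
    Example (from the mapping below): "NBS Somalia" maps to "nbs"; unrecognized
    sources fall back to "fscluster".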
|
|
""" |
|
|
source_to_type = { |
|
|
"FS Cluster": "fscluster", |
|
|
"ReliefWeb": "reliefweb", |
|
|
"NBS Somalia": "nbs", |
|
|
"HDX": "hdx", |
|
|
"HDX Humanitarian Data Exchange": "hdx", |
|
|
"LogCluster": "logcluster", |
|
|
"FSNau": "fsnau", |
|
|
"FSNau - Food Security and Nutrition Analysis Unit": "fsnau", |
|
|
"FSNau Publications": "fsnau_publications", |
|
|
"FEWS NET": "fews", |
|
|
"FEWS NET - Famine Early Warning Systems Network": "fews", |
|
|
"ICPAC": "icpac", |
|
|
"ICPAC - IGAD Climate Prediction and Applications Centre": "icpac", |
|
|
"ICPAC - IGAD Climate Prediction and Applications Centre - Seasonal Forecast": "icpac_seasonal_forecast", |
|
|
"FAO SWALIM": "faoswalim", |
|
|
"FAO SWALIM Publications": "faoswalim_publications", |
|
|
"FAO SWALIM Journals": "faoswalim_journals", |
|
|
"FAO SWALIM Events": "faoswalim_events", |
|
|
"FAO SWALIM Articles": "faoswalim_articles", |
|
|
"FAO SWALIM Flood Watch": "faoswalim_flood_watch", |
|
|
"FAO SWALIM Water Publications": "faoswalim_water_publications", |
|
|
"MOPND Somaliland": "mopnd", |
|
|
"Copernicus Drought Observatory": "copernicus_drought", |
|
|
"fscluster": "fscluster", |
|
|
"reliefweb": "reliefweb", |
|
|
"NBS": "nbs", |
|
|
"HDX": "hdx", |
|
|
"LogCluster": "logcluster", |
|
|
"FSNau": "fsnau", |
|
|
"FSNau Publications": "fsnau_publications", |
|
|
"FEWS NET": "fews", |
|
|
"ICPAC": "icpac", |
|
|
"FAO SWALIM": "faoswalim" |
|
|
} |
|
|
return source_to_type.get(source, "fscluster") |
|
|
|
|
|
|
|
|
def extract_pdf_text(pdf_url: str, source: str = "unknown") -> str: |
|
|
""" |
|
|
Extract text content from archived PDF using multiple methods |
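
    Illustrative usage (URL is hypothetical):
        text = extract_pdf_text("/documents/report.pdf", source="FS Cluster")
        # A relative URL is resolved against the configured base_url, the PDF is
        # downloaded (or reused from the archive), and its extracted text is returned.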
|
|
""" |
|
|
try: |
|
|
logger.info(f"π Starting PDF text extraction for URL: {pdf_url}") |
|
|
logger.info(f"π Source: {source}") |
|
|
|
|
|
|
|
|
parsed_url = urlparse(pdf_url) |
|
|
|
|
|
|
|
|
if not parsed_url.scheme and pdf_url.startswith('/'): |
|
|
|
|
|
website_type = get_website_type_from_source(source) |
|
|
config = WEBSITE_CONFIG.get(website_type, {}) |
|
|
base_url = config.get('base_url', 'https://fscluster.org') |
|
|
|
|
|
logger.info(f"π Using base_url from config for {website_type}: {base_url}") |
|
|
|
|
|
|
|
|
complete_url = f"{base_url}{pdf_url}" |
|
|
logger.info(f"π Converted relative URL {pdf_url} to absolute URL: {complete_url}") |
|
|
pdf_url = complete_url |
|
|
|
|
|
|
|
|
if is_pdf_archived(pdf_url, source): |
|
|
cached_path = get_archived_pdf_path(pdf_url, source) |
|
|
logger.info(f"π Using archived PDF: {cached_path}") |
|
|
result = extract_text_from_pdf_file(cached_path) |
|
|
logger.info(f"π Extracted text length: {len(result)} characters") |
|
|
|
|
|
if not result.strip(): |
|
|
logger.warning("β οΈ No text extracted from PDF - might be image-based or corrupted") |
|
|
else: |
|
|
logger.info(f"β
Successfully extracted text from PDF") |
|
|
|
|
|
return result |
|
|
else: |
|
|
|
|
|
logger.info(f"β PDF not found in archive: {pdf_url}") |
|
|
logger.info(f"β¬οΈ Attempting to download PDF now...") |
|
|
|
|
|
|
|
|
download_result = download_and_save_pdf(pdf_url, source) |
|
|
if download_result["success"]: |
|
|
logger.info(f"β
Successfully downloaded PDF: {download_result['path']}") |
|
|
|
|
|
result = extract_text_from_pdf_file(download_result["path"]) |
|
|
return result |
|
|
else: |
|
|
logger.error(f"β Failed to download PDF: {download_result['message']}") |
|
|
|
|
|
|
|
|
if source.lower() == "fscluster" and "403" in download_result["message"]: |
|
|
return f"PDF download blocked by fscluster.org (403 Forbidden). Try visiting the document page first in your browser before scraping, or use authenticated session cookies: {pdf_url}" |
|
|
else: |
|
|
return f"PDF not found in archive and download failed: {pdf_url}" |
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"β Error extracting PDF text from {pdf_url}: {str(e)}") |
|
|
return f"Error extracting PDF: {str(e)}" |
|
|
|
|
|
def extract_text_from_pdf_file(pdf_file_or_path): |
|
|
""" |
|
|
Extract text from PDF using multiple methods for better compatibility |
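
    Extraction order (each backend is tried until one yields text): pypdf,
    pdfplumber, PyMuPDF (fitz), then OCR via pytesseract + pdf2image as a last
    resort when fewer than 500 characters were recovered.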
|
|
""" |
|
|
text_content = "" |
|
|
|
|
|
try: |
|
|
logger.info(f"π Starting PDF text extraction...") |
|
|
|
|
|
|
|
|
try: |
|
|
logger.info(f"π Trying pypdf extraction...") |
|
|
import pypdf |
|
|
|
|
|
if isinstance(pdf_file_or_path, str): |
|
|
|
|
|
logger.info(f"π Reading from file path: {pdf_file_or_path}") |
|
|
with open(pdf_file_or_path, 'rb') as file: |
|
|
pdf_reader = pypdf.PdfReader(file) |
|
|
logger.info(f"π PDF has {len(pdf_reader.pages)} pages") |
|
|
for i, page in enumerate(pdf_reader.pages): |
|
|
page_text = page.extract_text() |
|
|
if page_text: |
|
|
text_content += page_text + "\n" |
|
|
else: |
|
|
|
|
|
logger.info(f"π Reading from BytesIO object") |
|
|
pdf_reader = pypdf.PdfReader(pdf_file_or_path) |
|
|
logger.info(f"π PDF has {len(pdf_reader.pages)} pages") |
|
|
for i, page in enumerate(pdf_reader.pages): |
|
|
page_text = page.extract_text() |
|
|
if page_text: |
|
|
text_content += page_text + "\n" |
|
|
|
|
|
if text_content.strip(): |
|
|
logger.info(f"β
Successfully extracted text using pypdf: {len(text_content)} characters") |
|
|
return text_content.strip() |
|
|
else: |
|
|
logger.warning("β οΈ pypdf extracted no text") |
|
|
except Exception as e: |
|
|
logger.warning(f"β οΈ pypdf extraction failed: {str(e)}") |
|
|
|
|
|
|
|
|
try: |
|
|
logger.info(f"π Trying pdfplumber extraction...") |
|
|
import pdfplumber |
|
|
|
|
|
if isinstance(pdf_file_or_path, str): |
|
|
with pdfplumber.open(pdf_file_or_path) as pdf: |
|
|
logger.info(f"π PDF has {len(pdf.pages)} pages") |
|
|
for i, page in enumerate(pdf.pages): |
|
|
page_text = page.extract_text() |
|
|
if page_text: |
|
|
text_content += page_text + "\n" |
|
|
else: |
|
|
|
|
|
with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file: |
|
|
temp_file.write(pdf_file_or_path.getvalue()) |
|
|
temp_file.flush() |
|
|
|
|
|
with pdfplumber.open(temp_file.name) as pdf: |
|
|
logger.info(f"π PDF has {len(pdf.pages)} pages") |
|
|
for i, page in enumerate(pdf.pages): |
|
|
page_text = page.extract_text() |
|
|
if page_text: |
|
|
text_content += page_text + "\n" |
|
|
|
|
|
|
|
|
os.unlink(temp_file.name) |
|
|
logger.info(f"ποΈ Temp file cleaned up") |
|
|
|
|
|
if text_content.strip(): |
|
|
logger.info(f"β
Successfully extracted text using pdfplumber: {len(text_content)} characters") |
|
|
return text_content.strip() |
|
|
else: |
|
|
logger.warning("β οΈ pdfplumber extracted no text") |
|
|
except ImportError: |
|
|
logger.warning("β οΈ pdfplumber not available") |
|
|
except Exception as e: |
|
|
logger.warning(f"β οΈ pdfplumber extraction failed: {str(e)}") |
|
|
|
|
|
|
|
|
try: |
|
|
logger.info(f"π Trying PyMuPDF extraction...") |
|
|
import fitz |
|
|
|
|
|
if isinstance(pdf_file_or_path, str): |
|
|
doc = fitz.open(pdf_file_or_path) |
|
|
else: |
|
|
doc = fitz.open(stream=pdf_file_or_path.getvalue(), filetype="pdf") |
|
|
|
|
|
logger.info(f"π PDF has {doc.page_count} pages") |
|
|
for page_num in range(doc.page_count): |
|
|
page = doc.load_page(page_num) |
|
|
page_text = page.get_text() |
|
|
if page_text: |
|
|
text_content += page_text + "\n" |
|
|
|
|
|
doc.close() |
|
|
|
|
|
if text_content.strip(): |
|
|
logger.info(f"β
Successfully extracted text using PyMuPDF: {len(text_content)} characters") |
|
|
return text_content.strip() |
|
|
else: |
|
|
logger.warning("β οΈ PyMuPDF extracted no text") |
|
|
except ImportError: |
|
|
logger.warning("β οΈ PyMuPDF not available") |
|
|
except Exception as e: |
|
|
logger.warning(f"β οΈ PyMuPDF extraction failed: {str(e)}") |
|
|
|
|
|
|
|
|
|
|
|
if not text_content.strip() or len(text_content.strip()) < 500: |
|
|
try: |
|
|
logger.info(f"π Trying OCR extraction as last resort...") |
|
|
import pytesseract |
|
|
from PIL import Image |
|
|
from pdf2image import convert_from_path, convert_from_bytes |
|
|
|
|
|
if isinstance(pdf_file_or_path, str): |
|
|
|
|
|
images = convert_from_path(pdf_file_or_path, dpi=300) |
|
|
else: |
|
|
|
|
|
images = convert_from_bytes(pdf_file_or_path.getvalue(), dpi=300) |
|
|
|
|
|
logger.info(f"πΌοΈ Converted PDF to {len(images)} images for OCR") |
|
|
|
|
|
for i, image in enumerate(images): |
|
|
|
|
|
page_text = pytesseract.image_to_string(image, lang='eng') |
|
|
if page_text.strip(): |
|
|
text_content += f"Page {i+1} (OCR):\n{page_text}\n" |
|
|
logger.info(f"π OCR extracted {len(page_text)} characters from page {i+1}") |
|
|
|
|
|
if text_content.strip(): |
|
|
logger.info(f"β
Successfully extracted text using OCR: {len(text_content)} characters") |
|
|
return text_content.strip() |
|
|
else: |
|
|
logger.warning("β οΈ OCR extracted no text") |
|
|
except ImportError: |
|
|
logger.warning("β οΈ OCR libraries not available (pytesseract, pdf2image)") |
|
|
except Exception as e: |
|
|
logger.warning(f"β OCR extraction failed: {str(e)}") |
|
|
|
|
|
|
|
|
if text_content.strip(): |
|
|
logger.info(f"β οΈ Returning partial text extraction ({len(text_content.strip())} characters)") |
|
|
return text_content.strip() |
|
|
|
|
|
|
|
|
logger.warning("β All PDF extraction methods failed") |
|
|
return "PDF text extraction failed - document may be image-based or corrupted" |
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"β Error in PDF text extraction: {str(e)}") |
|
|
return f"PDF text extraction failed: {str(e)}" |
|
|
|
|
|
async def download_all_pdfs_from_page(page, url: str, config: dict, source: str, start_date: str = None, end_date: str = None) -> List[dict]: |
|
|
""" |
|
|
Download all PDFs from multiple pages with pagination support |
|
|
Supports both approaches: |
|
|
1. Direct PDF discovery (pdf_links only) |
|
|
2. Page links first, then PDF discovery (page_links + pdf_links) |
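
    Illustrative config shapes (keys match the lookups performed below; the
    selector values are hypothetical):
        Approach 1: {"pdf_links": "a[href$='.pdf']",
                     "navigation_selector": "a.next",
                     "navigation_url_addition": "?page={page_no}",
                     "start_page": 1}
        Approach 2: {"page_links": "h3.title a", "pdf_links": "a.download",
                     "title": "h1", "date": ".posted-on"}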
|
|
""" |
|
|
try: |
|
|
logger.info(f"π Starting PDF download from page: {url}") |
|
|
logger.info(f"π Source: {source}") |
|
|
|
|
|
|
|
|
if source == "mopnd": |
|
|
clear_mopnd_cache() |
|
|
|
|
|
|
|
|
reset_global_pdf_count() |
|
|
logger.info(f"π Reset global PDF counter. Limit: {MAX_PDF_LIMIT}") |
|
|
|
|
|
|
|
|
extract_table_as_csv = config.get("extract_table_as_csv", False) |
|
|
if extract_table_as_csv: |
|
|
logger.info("π Using table extraction mode: Extract table data and convert to CSV") |
|
|
return await extract_table_as_csv_file(page, url, config, source, start_date, end_date) |
|
|
|
|
|
|
|
|
page_links_selector = config.get("page_links") |
|
|
pdf_links_selector = config.get("pdf_links") |
|
|
file_links_selector = config.get("file_links") |
|
|
|
|
|
|
|
|
logger.debug(f"π Config check for source '{source}': page_links={page_links_selector}, pdf_links={pdf_links_selector}, file_links={file_links_selector}") |
|
|
|
|
|
|
|
|
|
|
|
if page_links_selector and pdf_links_selector: |
|
|
|
|
|
logger.info("π Using Approach 2: Page links first, then PDF discovery") |
|
|
return await download_pdfs_via_page_links(page, url, config, source, start_date, end_date) |
|
|
elif page_links_selector and file_links_selector: |
|
|
|
|
|
logger.info("π Using Approach 2: Page links first, then file discovery") |
|
|
return await download_pdfs_via_page_links(page, url, config, source, start_date, end_date) |
|
|
elif pdf_links_selector or file_links_selector: |
|
|
|
|
|
logger.info("π Using Approach 1: Direct PDF/file discovery") |
|
|
return await download_pdfs_direct(page, url, config, source, start_date, end_date) |
|
|
else: |
|
|
logger.error("β No pdf_links, file_links, or page_links configured") |
|
|
return [] |
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"β Error downloading PDFs from pages: {str(e)}") |
|
|
return [] |
|
|
|
|
|
|
|
|
async def extract_table_as_csv_file(page, url: str, config: dict, source: str, start_date: str = None, end_date: str = None) -> List[dict]: |
|
|
""" |
|
|
Special function to extract table data and convert to CSV |
|
|
""" |
|
|
try: |
|
|
logger.info(f"π Starting table extraction from page: {url}") |
|
|
logger.info(f"π Source: {source}") |
|
|
|
|
|
|
|
|
await page.goto(url, wait_until="domcontentloaded", timeout=30000) |
|
|
|
|
|
|
|
|
content_selector = config.get("content") |
|
|
if not content_selector: |
|
|
logger.error("β No content selector configured for table extraction") |
|
|
return [] |
|
|
|
|
|
logger.info(f"π Extracting table data using selector: {content_selector}") |
|
|
|
|
|
|
|
|
cell_elements = await page.query_selector_all(content_selector) |
|
|
logger.info(f"π Found {len(cell_elements)} table cells") |
|
|
|
|
|
if not cell_elements: |
|
|
logger.warning("β οΈ No table cells found") |
|
|
return [] |
|
|
|
|
|
|
|
|
cells_data = [] |
|
|
for element in cell_elements: |
|
|
try: |
|
|
cell_text = await element.text_content() |
|
|
if cell_text: |
|
|
cells_data.append(cell_text.strip()) |
|
|
else: |
|
|
cells_data.append("") |
|
|
except Exception as e: |
|
|
logger.debug(f"β οΈ Error extracting cell text: {str(e)}") |
|
|
cells_data.append("") |
|
|
|
|
|
|
|
|
|
|
|
table_rows = [] |
|
|
try: |
|
|
|
|
|
row_elements = await page.query_selector_all("tr") |
|
|
if row_elements: |
|
|
logger.info(f"π Found {len(row_elements)} table rows") |
|
|
for row_element in row_elements: |
|
|
row_cells = await row_element.query_selector_all("td, th") |
|
|
row_data = [] |
|
|
for cell in row_cells: |
|
|
try: |
|
|
cell_text = await cell.text_content() |
|
|
row_data.append(cell_text.strip() if cell_text else "") |
|
|
except: |
|
|
row_data.append("") |
|
|
if row_data: |
|
|
table_rows.append(row_data) |
|
|
except Exception as e: |
|
|
logger.warning(f"β οΈ Could not extract table rows: {str(e)}") |
|
|
|
|
|
|
|
|
        # Fall back to the flat cell list as a single CSV row only when no
        # structured rows were recovered above
        if not table_rows and cells_data:
            table_rows = [cells_data]
|
|
|
|
|
if not table_rows: |
|
|
logger.warning("β οΈ No table rows extracted") |
|
|
return [] |
|
|
|
|
|
|
|
|
import csv |
|
|
import io |
|
|
|
|
|
csv_buffer = io.StringIO() |
|
|
csv_writer = csv.writer(csv_buffer) |
|
|
|
|
|
|
|
|
for row in table_rows: |
|
|
csv_writer.writerow(row) |
|
|
|
|
|
csv_content = csv_buffer.getvalue() |
|
|
csv_buffer.close() |
|
|
|
|
|
logger.info(f"π Generated CSV content: {len(csv_content)} characters, {len(table_rows)} rows") |
|
|
|
|
|
|
|
|
from datetime import datetime |
|
|
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") |
|
|
filename = f"river_levels_{timestamp}.csv" |
|
|
|
|
|
|
|
|
csv_bytes = csv_content.encode('utf-8') |
|
|
csv_file_path = archive_file(url, csv_bytes, source, "csv") |
|
|
|
|
|
logger.info(f"π CSV file saved to: {csv_file_path}") |
|
|
|
|
|
|
|
|
document = { |
|
|
"url": url, |
|
|
"local_path": csv_file_path, |
|
|
"size": len(csv_bytes), |
|
|
"title": f"River Levels Data - {datetime.now().strftime('%Y-%m-%d')}", |
|
|
"source": source, |
|
|
"extracted_text": f"CSV File: {filename}\nFile Path: {csv_file_path}\nTotal Rows: {len(table_rows)}\n\nPreview:\n{csv_content[:500]}...", |
|
|
"file_type": "CSV", |
|
|
"date": datetime.now().strftime("%Y-%m-%d") |
|
|
} |
|
|
|
|
|
|
|
|
increment_global_pdf_count() |
|
|
|
|
|
logger.info(f"β
Successfully extracted table data and saved as CSV") |
|
|
return [document] |
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"β Error extracting table as CSV: {str(e)}") |
|
|
return [] |
|
|
|
|
|
|
|
|
async def download_pdfs_direct(page, url: str, config: dict, source: str, start_date: str = None, end_date: str = None) -> List[dict]: |
|
|
""" |
|
|
Approach 1: Direct PDF discovery on listing pages |
|
|
""" |
|
|
try: |
|
|
|
|
|
navigation_selector = config.get("navigation_selector") |
|
|
navigation_url_addition = config.get("navigation_url_addition") |
|
|
start_page = config.get("start_page", 1) |
|
|
|
|
|
all_pdfs = [] |
|
|
seen_pdf_urls = set() |
|
|
current_page = start_page |
|
|
consecutive_empty_pages = 0 |
|
|
max_consecutive_empty = 2 |
|
|
|
|
|
|
|
|
await page.goto(url, wait_until="domcontentloaded", timeout=30000) |
|
|
|
|
|
|
|
|
if navigation_selector and navigation_url_addition: |
|
|
logger.info(f"π§ Navigation configured: selector={navigation_selector}, url_addition={navigation_url_addition}") |
|
|
logger.info(f"π Starting from page: {start_page}") |
|
|
|
|
|
while True: |
|
|
logger.info(f"π Processing page {current_page}") |
|
|
|
|
|
|
|
|
if MAX_PAGE_LIMIT is not None and current_page > MAX_PAGE_LIMIT: |
|
|
logger.info(f"π Reached MAX_PAGE_LIMIT ({MAX_PAGE_LIMIT}), stopping pagination") |
|
|
break |
|
|
|
|
|
|
|
|
if current_page > start_page: |
|
|
nav_url_addition = navigation_url_addition.replace("{page_no}", str(current_page)) |
|
|
nav_url = construct_navigation_url(url, nav_url_addition) |
|
|
logger.info(f"π§ Navigating to: {nav_url}") |
|
|
await page.goto(nav_url, wait_until="domcontentloaded", timeout=30000) |
|
|
|
|
|
captcha_result = await check_and_wait_for_recaptcha(page, config) |
|
|
if captcha_result == "CAPTCHA_TIMEOUT": |
|
|
logger.error("β Captcha detected but not solved within timeout period") |
|
|
return [] |
|
|
|
|
|
|
|
|
nav_element = await page.query_selector(navigation_selector) |
|
|
if current_page == start_page and nav_element: |
|
|
logger.info("β
Navigation element found, more pages available") |
|
|
elif current_page > start_page and not nav_element: |
|
|
logger.info("π No more navigation elements found, stopping pagination") |
|
|
break |
|
|
|
|
|
|
|
|
if is_pdf_limit_reached(): |
|
|
logger.info(f"π Global PDF limit reached ({MAX_PDF_LIMIT}), stopping pagination") |
|
|
break |
|
|
|
|
|
|
|
|
page_pdfs = await extract_pdfs_from_current_page(page, config, source, start_date, end_date) |
|
|
|
|
|
if page_pdfs: |
|
|
|
|
|
new_pdfs = [] |
|
|
for pdf in page_pdfs: |
|
|
pdf_url = pdf.get("url", "") |
|
|
if pdf_url and pdf_url not in seen_pdf_urls: |
|
|
seen_pdf_urls.add(pdf_url) |
|
|
new_pdfs.append(pdf) |
|
|
|
|
|
if new_pdfs: |
|
|
all_pdfs.extend(new_pdfs) |
|
|
consecutive_empty_pages = 0 |
|
|
logger.info(f"π Found {len(new_pdfs)} new PDFs on page {current_page} (total: {len(page_pdfs)} PDFs on page)") |
|
|
else: |
|
|
consecutive_empty_pages += 1 |
|
|
logger.info(f"π No new PDFs found on page {current_page} (all {len(page_pdfs)} PDFs were duplicates)") |
|
|
|
|
|
|
|
|
if consecutive_empty_pages >= max_consecutive_empty: |
|
|
logger.info(f"π Stopping pagination: {consecutive_empty_pages} consecutive pages with no new content") |
|
|
break |
|
|
else: |
|
|
consecutive_empty_pages += 1 |
|
|
logger.info(f"π No PDFs found on page {current_page}") |
|
|
|
|
|
|
|
|
if consecutive_empty_pages >= max_consecutive_empty: |
|
|
logger.info(f"π Stopping pagination: {consecutive_empty_pages} consecutive pages with no content") |
|
|
break |
|
|
|
|
|
current_page += 1 |
|
|
|
|
|
else: |
|
|
|
|
|
logger.info("π No navigation configured - scraping single page only") |
|
|
page_pdfs = await extract_pdfs_from_current_page(page, config, source, start_date, end_date) |
|
|
all_pdfs.extend(page_pdfs) |
|
|
|
|
|
logger.info(f"π Total unique PDFs found across all pages: {len(all_pdfs)}") |
|
|
return all_pdfs |
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"β Error in direct PDF discovery: {str(e)}") |
|
|
return [] |
|
|
|
|
|
|
|
|
async def download_pdfs_via_page_links(page, url: str, config: dict, source: str, start_date: str = None, end_date: str = None) -> List[dict]: |
|
|
""" |
|
|
Approach 2: Page links first, then PDF discovery |
|
|
1. Go through pagination to collect all page links |
|
|
2. Visit each individual page link |
|
|
3. Find and download PDFs from each page |
|
|
""" |
|
|
try: |
|
|
logger.info("π Starting Approach 2: Page links first, then PDF discovery") |
|
|
|
|
|
|
|
|
logger.info("π Step 1: Collecting all page links through pagination") |
|
|
all_page_links = await collect_all_page_links(page, url, config, source) |
|
|
|
|
|
if not all_page_links: |
|
|
logger.warning("β οΈ No page links found") |
|
|
return [] |
|
|
|
|
|
logger.info(f"π Collected {len(all_page_links)} page links") |
|
|
|
|
|
|
|
|
logger.info("π Step 2: Visiting individual pages to find PDFs") |
|
|
all_pdfs = [] |
|
|
seen_pdf_urls = set() |
|
|
|
|
|
for i, page_url in enumerate(all_page_links, 1): |
|
|
if scraping_cancelled(): |
|
|
logger.info("π Scraping cancelled, stopping PDF downloads") |
|
|
break |
|
|
|
|
|
|
|
|
if is_pdf_limit_reached(): |
|
|
logger.info(f"π Global PDF limit reached ({MAX_PDF_LIMIT}), stopping page processing") |
|
|
break |
|
|
|
|
|
logger.info(f"π Processing page {i}/{len(all_page_links)}: {page_url}") |
|
|
logger.info(f"π Global PDF count: {get_global_pdf_count()}/{MAX_PDF_LIMIT}") |
|
|
|
|
|
try: |
|
|
|
|
|
await page.goto(page_url, wait_until="domcontentloaded", timeout=30000) |
|
|
|
|
|
|
|
|
captcha_result = await check_and_wait_for_recaptcha(page, config) |
|
|
if captcha_result == "CAPTCHA_TIMEOUT": |
|
|
logger.error("β Captcha detected but not solved within timeout period") |
|
|
return [{ |
|
|
"title": "CAPTCHA_ERROR", |
|
|
"content": "Captcha detected. Please try again later. The website requires captcha verification which could not be completed automatically.", |
|
|
"date": datetime.now().strftime("%Y-%m-%d"), |
|
|
"url": page_url |
|
|
}] |
|
|
|
|
|
|
|
|
page_title = "" |
|
|
|
|
|
|
|
|
if source == "mopnd": |
|
|
|
|
|
if page_url in mopnd_article_titles: |
|
|
page_title = mopnd_article_titles[page_url] |
|
|
logger.info(f"π Using MOPND cached title from listing page: {page_title}") |
|
|
else: |
|
|
|
|
|
page_url_parsed = urlparse(page_url) |
|
|
page_url_normalized = urlunparse((page_url_parsed.scheme, page_url_parsed.netloc, page_url_parsed.path, '', '', '')) |
|
|
|
|
|
|
|
|
matching_url = None |
|
|
for cached_url in mopnd_article_titles.keys(): |
|
|
cached_parsed = urlparse(cached_url) |
|
|
cached_normalized = urlunparse((cached_parsed.scheme, cached_parsed.netloc, cached_parsed.path, '', '', '')) |
|
|
if cached_normalized == page_url_normalized: |
|
|
matching_url = cached_url |
|
|
break |
|
|
|
|
|
if matching_url: |
|
|
page_title = mopnd_article_titles[matching_url] |
|
|
logger.info(f"π Using MOPND cached title (matched normalized URL): {page_title}") |
|
|
else: |
|
|
logger.warning(f"β οΈ MOPND title not found in cache for URL: {page_url}") |
|
|
logger.debug(f"π Available URLs in cache: {list(mopnd_article_titles.keys())[:3]}") |
|
|
else: |
|
|
|
|
|
title_selector = config.get("title") |
|
|
if title_selector: |
|
|
try: |
|
|
title_element = await page.query_selector(title_selector) |
|
|
if title_element: |
|
|
page_title = await title_element.text_content() |
|
|
if page_title: |
|
|
page_title = page_title.strip() |
|
|
logger.info(f"π Extracted title from page: {page_title}") |
|
|
else: |
|
|
logger.debug(f"β οΈ Title element found but no text content") |
|
|
else: |
|
|
logger.debug(f"β οΈ Title element not found with selector: {title_selector}") |
|
|
except Exception as e: |
|
|
logger.warning(f"β οΈ Error extracting title from page: {str(e)}") |
|
|
|
|
|
|
|
|
page_pdfs = await extract_pdfs_from_current_page(page, config, source, start_date, end_date, use_page_title_for_pdfs=True, page_title=page_title) |
|
|
|
|
|
if page_pdfs: |
|
|
|
|
|
new_pdfs = [] |
|
|
for pdf in page_pdfs: |
|
|
pdf_url = pdf.get("url", "") |
|
|
if pdf_url and pdf_url not in seen_pdf_urls: |
|
|
seen_pdf_urls.add(pdf_url) |
|
|
new_pdfs.append(pdf) |
|
|
|
|
|
if new_pdfs: |
|
|
all_pdfs.extend(new_pdfs) |
|
|
logger.info(f"π Found {len(new_pdfs)} new PDFs on page {i} (total: {len(page_pdfs)} PDFs on page)") |
|
|
else: |
|
|
logger.info(f"π No new PDFs found on page {i} (all {len(page_pdfs)} PDFs were duplicates)") |
|
|
else: |
|
|
logger.info(f"π No PDFs found on page {i}") |
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"β Error processing page {i} ({page_url}): {str(e)}") |
|
|
continue |
|
|
|
|
|
logger.info(f"π Total unique PDFs found across all pages: {len(all_pdfs)}") |
|
|
|
|
|
|
|
|
if all_pdfs: |
|
|
logger.info(f"π Sample PDF structure: {all_pdfs[0]}") |
|
|
else: |
|
|
logger.warning("β οΈ No PDFs found - this might be the issue") |
|
|
|
|
|
return all_pdfs |
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"β Error in page-links-first approach: {str(e)}") |
|
|
return [] |
|
|
|
|
|
|
|
|
async def check_and_wait_for_recaptcha(page, config: dict) -> Union[bool, str]:
|
|
""" |
|
|
Check if recaptcha is present on the page and wait for user to solve it |
|
|
|
|
|
Returns: |
|
|
        True if the recaptcha was detected and solved, the string "CAPTCHA_TIMEOUT"
        if it was detected but not solved within the timeout, False otherwise
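
    Illustrative caller pattern (mirrors how this helper is used elsewhere in
    this module):
        result = await check_and_wait_for_recaptcha(page, config)
        if result == "CAPTCHA_TIMEOUT":
            return []  # abort the current scrape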
|
|
""" |
|
|
from scraper_common import set_captcha_status, clear_captcha_status |
|
|
|
|
|
recaptcha_text = config.get("recaptcha_text") |
|
|
if not recaptcha_text: |
|
|
return False |
|
|
|
|
|
try: |
|
|
|
|
|
page_content = await page.content() |
|
|
if recaptcha_text.lower() in page_content.lower(): |
|
|
logger.warning(f"π‘οΈ Recaptcha detected on page: {recaptcha_text}") |
|
|
logger.info("β³ Waiting for user to solve recaptcha (max 60 seconds)...") |
|
|
logger.info("π‘ Please solve the recaptcha in the browser window") |
|
|
|
|
|
|
|
|
set_captcha_status("π‘οΈ Captcha detected! Please complete the captcha challenge in the browser window. Waiting for you to solve it...") |
|
|
|
|
|
|
|
|
max_wait_time = 60 |
|
|
wait_interval = 2 |
|
|
waited_time = 0 |
|
|
|
|
|
while waited_time < max_wait_time: |
|
|
await asyncio.sleep(wait_interval) |
|
|
waited_time += wait_interval |
|
|
|
|
|
|
|
|
remaining_time = max_wait_time - waited_time |
|
|
set_captcha_status(f"π‘οΈ Captcha detected! Please complete the captcha challenge in the browser window. Time remaining: {remaining_time}s...") |
|
|
|
|
|
|
|
|
current_content = await page.content() |
|
|
if recaptcha_text.lower() not in current_content.lower(): |
|
|
logger.info("β
Recaptcha appears to be solved, continuing...") |
|
|
|
|
|
clear_captcha_status() |
|
|
|
|
|
await asyncio.sleep(2) |
|
|
return True |
|
|
|
|
|
logger.debug(f"β³ Still waiting for recaptcha to be solved... ({waited_time}/{max_wait_time}s)") |
|
|
|
|
|
logger.warning(f"β οΈ Recaptcha wait timeout ({max_wait_time}s). Continuing anyway...") |
|
|
|
|
|
clear_captcha_status() |
|
|
|
|
|
return "CAPTCHA_TIMEOUT" |
|
|
else: |
|
|
|
|
|
clear_captcha_status() |
|
|
|
|
|
except Exception as e: |
|
|
logger.warning(f"β οΈ Error checking for recaptcha: {str(e)}") |
|
|
clear_captcha_status() |
|
|
return False |
|
|
|
|
|
return False |
|
|
|
|
|
|
|
|
async def collect_all_page_links(page, url: str, config: dict, source: str) -> List[str]: |
|
|
""" |
|
|
Collect all page links through pagination |
|
|
""" |
|
|
try: |
|
|
logger.info("π Starting page link collection through pagination") |
|
|
|
|
|
|
|
|
navigation_selector = config.get("navigation_selector") |
|
|
navigation_url_addition = config.get("navigation_url_addition") |
|
|
start_page = config.get("start_page", 1) |
|
|
page_links_selector = config.get("page_links") |
|
|
|
|
|
if not page_links_selector: |
|
|
logger.error("β No page_links selector configured") |
|
|
return [] |
|
|
|
|
|
all_page_links = [] |
|
|
seen_page_urls = set() |
|
|
current_page = start_page |
|
|
consecutive_empty_pages = 0 |
|
|
max_consecutive_empty = 2 |
|
|
|
|
|
|
|
|
await page.goto(url, wait_until="domcontentloaded", timeout=30000) |
|
|
|
|
|
|
|
|
captcha_result = await check_and_wait_for_recaptcha(page, config) |
|
|
if captcha_result == "CAPTCHA_TIMEOUT": |
|
|
logger.error("β Captcha detected but not solved within timeout period") |
|
|
return [{ |
|
|
"title": "CAPTCHA_ERROR", |
|
|
"content": "Captcha detected. Please try again later. The website requires captcha verification which could not be completed automatically.", |
|
|
"date": datetime.now().strftime("%Y-%m-%d"), |
|
|
"url": url |
|
|
}] |
|
|
|
|
|
|
|
|
if navigation_selector and navigation_url_addition: |
|
|
logger.info(f"π§ Navigation configured: selector={navigation_selector}, url_addition={navigation_url_addition}") |
|
|
logger.info(f"π Starting from page: {start_page}") |
|
|
|
|
|
while True: |
|
|
logger.info(f"π Collecting page links from page {current_page}") |
|
|
|
|
|
|
|
|
if MAX_PAGE_LIMIT is not None and current_page > MAX_PAGE_LIMIT: |
|
|
logger.info(f"π Reached MAX_PAGE_LIMIT ({MAX_PAGE_LIMIT}), stopping pagination") |
|
|
break |
|
|
|
|
|
|
|
|
if current_page > start_page: |
|
|
nav_url_addition = navigation_url_addition.replace("{page_no}", str(current_page)) |
|
|
nav_url = construct_navigation_url(url, nav_url_addition) |
|
|
logger.info(f"π§ Navigating to: {nav_url}") |
|
|
await page.goto(nav_url, wait_until="domcontentloaded", timeout=30000) |
|
|
|
|
|
captcha_result = await check_and_wait_for_recaptcha(page, config) |
|
|
if captcha_result == "CAPTCHA_TIMEOUT": |
|
|
logger.error("β Captcha detected but not solved within timeout period") |
|
|
return [] |
|
|
|
|
|
|
|
|
nav_element = await page.query_selector(navigation_selector) |
|
|
if current_page == start_page and nav_element: |
|
|
logger.info("β
Navigation element found, more pages available") |
|
|
|
|
|
elif current_page > start_page and not nav_element: |
|
|
logger.info("π No more navigation elements found, stopping pagination") |
|
|
break |
|
|
|
|
|
|
|
|
|
|
|
if source == "mopnd": |
|
|
page_links = await extract_mopnd_page_links_with_dates(page, config) |
|
|
else: |
|
|
page_links = await extract_page_links_from_current_page(page, config) |
|
|
|
|
|
if page_links: |
|
|
|
|
|
new_page_links = [] |
|
|
for page_link in page_links: |
|
|
if page_link and page_link not in seen_page_urls: |
|
|
seen_page_urls.add(page_link) |
|
|
new_page_links.append(page_link) |
|
|
|
|
|
if new_page_links: |
|
|
all_page_links.extend(new_page_links) |
|
|
consecutive_empty_pages = 0 |
|
|
logger.info(f"π Found {len(new_page_links)} new page links on page {current_page} (total: {len(page_links)} page links on page)") |
|
|
else: |
|
|
consecutive_empty_pages += 1 |
|
|
logger.info(f"π No new page links found on page {current_page} (all {len(page_links)} page links were duplicates)") |
|
|
|
|
|
|
|
|
if consecutive_empty_pages >= max_consecutive_empty: |
|
|
logger.info(f"π Stopping pagination: {consecutive_empty_pages} consecutive pages with no new content") |
|
|
break |
|
|
else: |
|
|
consecutive_empty_pages += 1 |
|
|
logger.info(f"π No page links found on page {current_page}") |
|
|
|
|
|
|
|
|
if consecutive_empty_pages >= max_consecutive_empty: |
|
|
logger.info(f"π Stopping pagination: {consecutive_empty_pages} consecutive pages with no content") |
|
|
break |
|
|
|
|
|
current_page += 1 |
|
|
|
|
|
else: |
|
|
|
|
|
logger.info("π No navigation configured - collecting page links from single page only") |
|
|
|
|
|
if source == "mopnd": |
|
|
page_links = await extract_mopnd_page_links_with_dates(page, config) |
|
|
else: |
|
|
page_links = await extract_page_links_from_current_page(page, config) |
|
|
all_page_links.extend(page_links) |
|
|
|
|
|
logger.info(f"π Total unique page links collected: {len(all_page_links)}") |
|
|
return all_page_links |
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"β Error collecting page links: {str(e)}") |
|
|
return [] |
|
|
|
|
|
|
|
|
async def extract_page_links_from_current_page(page, config: dict) -> List[str]: |
|
|
""" |
|
|
Extract page links from the current page |
|
|
""" |
|
|
try: |
|
|
|
|
|
page_links = [] |
|
|
page_links_selector = config.get("page_links") |
|
|
|
|
|
if isinstance(page_links_selector, list): |
|
|
for selector in page_links_selector: |
|
|
logger.info(f"π Looking for page links with selector: {selector}") |
|
|
elements = await page.query_selector_all(selector) |
|
|
logger.info(f"π° Found {len(elements)} elements with selector: {selector}") |
|
|
for element in elements: |
|
|
href = await element.get_attribute("href") |
|
|
if href: |
|
|
absolute_url = convert_to_absolute_url(href, page.url) |
|
|
page_links.append(absolute_url) |
|
|
else: |
|
|
|
|
|
|
|
|
link_element = await element.query_selector("a") |
|
|
if link_element: |
|
|
href = await link_element.get_attribute("href") |
|
|
if href: |
|
|
absolute_url = convert_to_absolute_url(href, page.url) |
|
|
page_links.append(absolute_url) |
|
|
continue |
|
|
|
|
|
|
|
|
try: |
|
|
parent = await element.evaluate_handle("el => el.parentElement") |
|
|
if parent: |
|
|
parent_link = await parent.query_selector("a") |
|
|
if parent_link: |
|
|
href = await parent_link.get_attribute("href") |
|
|
if href: |
|
|
absolute_url = convert_to_absolute_url(href, page.url) |
|
|
page_links.append(absolute_url) |
|
|
except Exception as e: |
|
|
logger.debug(f"β οΈ Could not find link in parent: {str(e)}") |
|
|
elif isinstance(page_links_selector, str): |
|
|
logger.info(f"π Looking for page links with selector: {page_links_selector}") |
|
|
elements = await page.query_selector_all(page_links_selector) |
|
|
logger.info(f"π° Found {len(elements)} elements with selector: {page_links_selector}") |
|
|
for element in elements: |
|
|
href = await element.get_attribute("href") |
|
|
if href: |
|
|
absolute_url = convert_to_absolute_url(href, page.url) |
|
|
page_links.append(absolute_url) |
|
|
else: |
|
|
|
|
|
|
|
|
link_element = await element.query_selector("a") |
|
|
if link_element: |
|
|
href = await link_element.get_attribute("href") |
|
|
if href: |
|
|
absolute_url = convert_to_absolute_url(href, page.url) |
|
|
page_links.append(absolute_url) |
|
|
continue |
|
|
|
|
|
|
|
|
try: |
|
|
parent = await element.evaluate_handle("el => el.parentElement") |
|
|
if parent: |
|
|
parent_link = await parent.query_selector("a") |
|
|
if parent_link: |
|
|
href = await parent_link.get_attribute("href") |
|
|
if href: |
|
|
absolute_url = convert_to_absolute_url(href, page.url) |
|
|
page_links.append(absolute_url) |
|
|
except Exception as e: |
|
|
logger.debug(f"β οΈ Could not find link in parent: {str(e)}") |
|
|
|
|
|
logger.info(f"π Found {len(page_links)} page links on current page") |
|
|
return page_links |
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"β Error extracting page links from current page: {str(e)}") |
|
|
return [] |
|
|
|
|
|
|
|
|
async def extract_mopnd_page_links_with_dates(page, config: dict) -> List[str]: |
|
|
""" |
|
|
Extract MOPND page links with dates and titles (special handling for MOPND) |
|
|
""" |
|
|
try: |
|
|
logger.info("π Extracting MOPND page links with dates and titles") |
|
|
|
|
|
|
|
|
page_links_selector = config.get("page_links") |
|
|
if not page_links_selector: |
|
|
logger.warning("β οΈ No page_links selector found in config") |
|
|
return [] |
|
|
|
|
|
|
|
|
date_selector = config.get("date") |
|
|
if not date_selector: |
|
|
logger.warning("β οΈ No date selector found in config") |
|
|
return [] |
|
|
|
|
|
|
|
|
title_selector = config.get("title") |
|
|
if not title_selector: |
|
|
logger.warning("β οΈ No title selector found in config") |
|
|
return [] |
|
|
|
|
|
|
|
|
logger.info(f"π Looking for page links with selector: {page_links_selector}") |
|
|
link_elements = await page.query_selector_all(page_links_selector) |
|
|
logger.info(f"π° Found {len(link_elements)} page link elements") |
|
|
|
|
|
|
|
|
logger.info(f"π Looking for dates with selector: {date_selector}") |
|
|
date_elements = await page.query_selector_all(date_selector) |
|
|
logger.info(f"π
Found {len(date_elements)} date elements") |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
page_links = [] |
|
|
for i, link_element in enumerate(link_elements): |
|
|
try: |
|
|
|
|
|
href = await link_element.get_attribute("href") |
|
|
if href: |
|
|
|
|
|
absolute_url = convert_to_absolute_url(href, page.url) |
|
|
page_links.append(absolute_url) |
|
|
|
|
|
|
|
|
try: |
|
|
title_text = await link_element.text_content() |
|
|
if title_text and title_text.strip(): |
|
|
|
|
|
mopnd_article_titles[absolute_url] = title_text.strip() |
|
|
logger.debug(f"β
Stored title for {absolute_url}: {title_text.strip()}") |
|
|
except Exception as e: |
|
|
logger.debug(f"β οΈ Could not extract title from link {i}: {str(e)}") |
|
|
|
|
|
|
|
|
|
|
|
date_found = False |
|
|
if i < len(date_elements): |
|
|
try: |
|
|
date_text = await date_elements[i].text_content() |
|
|
if date_text and date_text.strip(): |
|
|
|
|
|
mopnd_article_dates[absolute_url] = date_text.strip() |
|
|
logger.debug(f"β
Stored date for {absolute_url}: {date_text.strip()}") |
|
|
date_found = True |
|
|
except Exception as e: |
|
|
logger.debug(f"β οΈ Could not extract date for link {i}: {str(e)}") |
|
|
|
|
|
|
|
|
if not date_found: |
|
|
try: |
|
|
|
|
|
parent = await link_element.evaluate_handle("el => el.closest('.post_info, .post, [class*=\"post\"], [class*=\"item\"], [class*=\"entry\"]')") |
|
|
if parent: |
|
|
|
|
|
date_in_parent = await parent.query_selector(date_selector) |
|
|
if date_in_parent: |
|
|
date_text = await date_in_parent.text_content() |
|
|
if date_text and date_text.strip(): |
|
|
mopnd_article_dates[absolute_url] = date_text.strip() |
|
|
logger.debug(f"β
Stored date from parent container for {absolute_url}: {date_text.strip()}") |
|
|
date_found = True |
|
|
except Exception as e: |
|
|
logger.debug(f"β οΈ Could not find date in parent container: {str(e)}") |
|
|
|
|
|
if not date_found: |
|
|
logger.warning(f"β οΈ Could not extract date for link {i} ({absolute_url})") |
|
|
|
|
|
except Exception as e: |
|
|
logger.warning(f"β Error extracting link {i}: {str(e)}") |
|
|
continue |
|
|
|
|
|
logger.info(f"π Found {len(page_links)} MOPND page links with dates and titles") |
|
|
logger.info(f"π Stored {len(mopnd_article_titles)} titles and {len(mopnd_article_dates)} dates") |
|
|
|
|
|
|
|
|
if mopnd_article_titles: |
|
|
sample_titles = list(mopnd_article_titles.items())[:3] |
|
|
logger.debug(f"π Sample titles: {sample_titles}") |
|
|
if mopnd_article_dates: |
|
|
sample_dates = list(mopnd_article_dates.items())[:3] |
|
|
logger.debug(f"π Sample dates: {sample_dates}") |
|
|
|
|
|
return page_links |
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"β Error extracting MOPND page links: {str(e)}") |
|
|
return [] |
|
|
|
|
|
|
|
|
async def _extract_nbs_pdfs_grouped_by_title(page, config: dict, source: str, start_date: str = None, end_date: str = None) -> List[dict]: |
|
|
""" |
|
|
Special NBS handler: Multiple titles on one page, each title can have multiple PDFs |
|
|
Approach 1: Extract all titles and PDFs, then group PDFs sequentially by title |
|
|
""" |
|
|
try: |
|
|
logger.info(f"π· NBS special handling (Approach 1): Processing multiple titles with grouped PDFs") |
|
|
|
|
|
|
|
|
title_selector = config.get("title") |
|
|
titles = [] |
|
|
if title_selector: |
|
|
try: |
|
|
title_elements = await page.query_selector_all(title_selector) |
|
|
for element in title_elements: |
|
|
try: |
|
|
title_text = await element.text_content() |
|
|
if title_text: |
|
|
title_text = title_text.strip() |
|
|
titles.append(title_text) |
|
|
logger.debug(f"π Found title: {title_text}") |
|
|
except Exception as e: |
|
|
logger.debug(f"β οΈ Could not extract title text: {str(e)}") |
|
|
except Exception as e: |
|
|
logger.warning(f"β οΈ Error extracting titles: {str(e)}") |
|
|
|
|
|
if not titles: |
|
|
logger.warning("β οΈ No titles found on NBS page, falling back to standard processing") |
|
|
return [] |
|
|
|
|
|
logger.info(f"π Found {len(titles)} titles on page") |
|
|
|
|
|
|
|
|
pdf_selector = config.get("pdf_links") |
|
|
all_pdf_links = [] |
|
|
if isinstance(pdf_selector, list): |
|
|
for selector in pdf_selector: |
|
|
try: |
|
|
elements = await page.query_selector_all(selector) |
|
|
for element in elements: |
|
|
href = await element.get_attribute("href") |
|
|
if href: |
|
|
absolute_url = convert_to_absolute_url(href, page.url) |
|
|
try: |
|
|
link_text = await element.text_content() |
|
|
pdf_name = link_text.strip() if link_text else "" |
|
|
except: |
|
|
pdf_name = "" |
|
|
|
|
|
if not pdf_name: |
|
|
url_path = urlparse(absolute_url).path |
|
|
if url_path: |
|
|
pdf_name = unquote(os.path.basename(url_path)) |
|
|
if pdf_name.lower().endswith('.pdf'): |
|
|
pdf_name = pdf_name[:-4] |
|
|
|
|
|
|
|
|
if pdf_name and pdf_name.strip().lower() == "read more": |
|
|
logger.debug(f"βοΈ Skipping PDF with 'Read More' name: {absolute_url}") |
|
|
continue |
|
|
|
|
|
all_pdf_links.append({ |
|
|
"url": absolute_url, |
|
|
"name": pdf_name |
|
|
}) |
|
|
except Exception as e: |
|
|
logger.debug(f"β οΈ Error with PDF selector '{selector}': {str(e)}") |
|
|
elif isinstance(pdf_selector, str): |
|
|
try: |
|
|
elements = await page.query_selector_all(pdf_selector) |
|
|
for element in elements: |
|
|
href = await element.get_attribute("href") |
|
|
if href: |
|
|
absolute_url = convert_to_absolute_url(href, page.url) |
|
|
try: |
|
|
link_text = await element.text_content() |
|
|
pdf_name = link_text.strip() if link_text else "" |
|
|
except: |
|
|
pdf_name = "" |
|
|
|
|
|
if not pdf_name: |
|
|
url_path = urlparse(absolute_url).path |
|
|
if url_path: |
|
|
pdf_name = unquote(os.path.basename(url_path)) |
|
|
if pdf_name.lower().endswith('.pdf'): |
|
|
pdf_name = pdf_name[:-4] |
|
|
|
|
|
|
|
|
if pdf_name and pdf_name.strip().lower() == "read more": |
|
|
logger.debug(f"βοΈ Skipping PDF with 'Read More' name: {absolute_url}") |
|
|
continue |
|
|
|
|
|
all_pdf_links.append({ |
|
|
"url": absolute_url, |
|
|
"name": pdf_name |
|
|
}) |
|
|
except Exception as e: |
|
|
logger.warning(f"β οΈ Error extracting PDF elements: {str(e)}") |
|
|
|
|
|
logger.info(f"π Found {len(all_pdf_links)} PDF links on page") |
|
|
|
|
|
if not all_pdf_links: |
|
|
logger.warning("β οΈ No PDF links found on NBS page") |
|
|
return [] |
|
|
|
|
|
|
|
|
|
|
|
pdfs_per_title = len(all_pdf_links) // len(titles) if len(titles) > 0 else 0 |
|
|
remainder = len(all_pdf_links) % len(titles) |
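        # Illustrative split: with 3 titles and 7 PDF links, pdfs_per_title is 2 and
        # remainder is 1, so the first title gets 3 PDFs and the other two get 2 each,
        # following page order.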
|
|
|
|
|
title_pdf_groups = [] |
|
|
pdf_index = 0 |
|
|
|
|
|
for i, title in enumerate(titles): |
|
|
|
|
|
num_pdfs = pdfs_per_title + (1 if i < remainder else 0) |
|
|
|
|
|
|
|
|
title_pdfs = all_pdf_links[pdf_index:pdf_index + num_pdfs] |
|
|
pdf_index += num_pdfs |
|
|
|
|
|
if title_pdfs: |
|
|
title_pdf_groups.append({ |
|
|
"title": title, |
|
|
"pdfs": title_pdfs |
|
|
}) |
|
|
logger.info(f"π Title '{title}': {len(title_pdfs)} associated PDFs") |
|
|
|
|
|
if not title_pdf_groups: |
|
|
logger.warning("β οΈ No title-PDF groups created") |
|
|
return [] |
|
|
|
|
|
|
|
|
date_selector = config.get("date") |
|
|
date_elements = [] |
|
|
if date_selector: |
|
|
try: |
|
|
date_elements = await page.query_selector_all(date_selector) |
|
|
except Exception as e: |
|
|
logger.debug(f"β οΈ Could not extract date elements: {str(e)}") |
|
|
|
|
|
|
|
|
all_documents = [] |
|
|
|
|
|
for group_idx, group in enumerate(title_pdf_groups): |
|
|
if scraping_cancelled(): |
|
|
logger.info("π Scraping cancelled, stopping NBS processing") |
|
|
break |
|
|
|
|
|
if is_pdf_limit_reached(): |
|
|
logger.info(f"π Global PDF limit reached ({MAX_PDF_LIMIT}), stopping NBS processing") |
|
|
break |
|
|
|
|
|
title = group["title"] |
|
|
pdf_list = group["pdfs"] |
|
|
|
|
|
logger.info(f"π· Processing title {group_idx+1}/{len(title_pdf_groups)}: '{title}' ({len(pdf_list)} PDFs)") |
|
|
|
|
|
|
|
|
successful_pdfs = [] |
|
|
combined_text_parts = [] |
|
|
all_pdf_paths = [] |
|
|
total_size = 0 |
|
|
|
|
|
for pdf_idx, pdf_info in enumerate(pdf_list): |
|
|
if scraping_cancelled(): |
|
|
break |
|
|
|
|
|
if is_pdf_limit_reached(): |
|
|
break |
|
|
|
|
|
pdf_url = pdf_info["url"] |
|
|
pdf_link_name = pdf_info.get("name", "") or f"PDF {pdf_idx+1}" |
|
|
|
|
|
|
|
|
if pdf_link_name and pdf_link_name.strip().lower() == "read more": |
|
|
logger.info(f" βοΈ Skipping PDF with 'Read More' name: {pdf_url}") |
|
|
continue |
|
|
|
|
|
logger.info(f" β¬οΈ Trying PDF {pdf_idx+1}/{len(pdf_list)}: {pdf_link_name}") |
|
|
|
|
|
try: |
|
|
download_result = download_and_save_pdf(pdf_url, source) |
|
|
if download_result["success"]: |
|
|
local_pdf_path = download_result["path"] |
|
|
extracted_text = extract_text_from_pdf_file(local_pdf_path) |
|
|
|
|
|
if extracted_text and len(extracted_text.strip()) > 10: |
|
|
current_count = increment_global_pdf_count() |
|
|
|
|
|
successful_pdfs.append({ |
|
|
"url": pdf_url, |
|
|
"path": local_pdf_path, |
|
|
"name": pdf_link_name, |
|
|
"size": download_result["size"], |
|
|
"text": extracted_text |
|
|
}) |
|
|
|
|
|
combined_text_parts.append(f"=== {pdf_link_name} ===\n{extracted_text}") |
|
|
all_pdf_paths.append(local_pdf_path) |
|
|
total_size += download_result["size"] |
|
|
|
|
|
logger.info(f" β
Successfully processed PDF '{pdf_link_name}' (Global: {current_count}/{MAX_PDF_LIMIT})") |
|
|
else: |
|
|
logger.warning(f" β οΈ PDF downloaded but no text extracted: {pdf_link_name}") |
|
|
else: |
|
|
logger.warning(f" β Failed to download PDF: {download_result.get('message', 'Unknown error')}") |
|
|
except Exception as e: |
|
|
logger.error(f" β Error processing PDF: {str(e)}") |
|
|
continue |
|
|
|
|
|
|
|
|
if successful_pdfs: |
|
|
|
|
|
pdf_date_raw = "" |
|
|
if date_elements: |
|
|
date_idx = min(group_idx, len(date_elements) - 1) |
|
|
try: |
|
|
date_text = await date_elements[date_idx].text_content() |
|
|
if date_text: |
|
|
pdf_date_raw = date_text.strip() |
|
|
except: |
|
|
pass |
|
|
|
|
|
|
|
|
pdf_date = standardize_date(pdf_date_raw, default_to_current=True) |
|
|
if not pdf_date: |
|
|
pdf_date = datetime.now().strftime("%Y-%m-%d") |
|
|
|
|
|
|
|
|
if start_date or end_date: |
|
|
start_dt = parse_date_input(start_date) if start_date else None |
|
|
end_dt = parse_date_input(end_date) if end_date else None |
|
|
if not is_date_in_range(pdf_date, start_dt, end_dt, include_missing=False): |
|
|
logger.info(f"π
Title date {pdf_date} is outside date range - skipping") |
|
|
continue |
|
|
|
|
|
|
|
|
combined_text = "\n\n".join(combined_text_parts) |
|
|
primary_path = all_pdf_paths[0] if all_pdf_paths else "" |
|
|
|
|
|
all_documents.append({ |
|
|
"url": successful_pdfs[0]["url"], |
|
|
"local_path": primary_path, |
|
|
"size": total_size, |
|
|
"title": title, |
|
|
"source": source, |
|
|
"extracted_text": combined_text, |
|
|
"file_type": "PDF", |
|
|
"date": pdf_date, |
|
|
"nbs_pdf_count": len(successful_pdfs), |
|
|
"nbs_all_paths": all_pdf_paths |
|
|
}) |
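|
# Illustrative shape of the record appended above (hypothetical values for a title that grouped two PDFs): |
#   {"url": "https://nbs.example/part1.pdf", "local_path": "archive/nbs/<date>/<hash>.pdf", |
#    "size": 2048576, "title": "Quarterly Bulletin", "source": "nbs", |
#    "extracted_text": "=== Part 1 ===\n...", "file_type": "PDF", "date": "2024-05-01", |
#    "nbs_pdf_count": 2, "nbs_all_paths": ["<path1>", "<path2>"]} |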
|
|
|
|
|
logger.info(f"β Created document for title '{title}' with {len(successful_pdfs)}/{len(pdf_list)} successful PDFs") |
|
|
else: |
|
|
logger.warning(f"β οΈ No PDFs successfully processed for title: '{title}' - moving forward") |
|
|
|
|
|
logger.info(f"π NBS Processing Summary: {len(all_documents)} documents created from {len(title_pdf_groups)} titles") |
|
|
return all_documents |
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"β Error in NBS PDF extraction: {str(e)}") |
|
|
return [] |
|
|
|
|
|
|
|
|
async def extract_pdfs_from_current_page(page, config: dict, source: str, start_date: str = None, end_date: str = None, use_page_title_for_pdfs: bool = False, page_title: str = None) -> List[dict]: |
|
|
""" |
|
|
Extract PDFs from the current page |
|
|
Special handling for NBS: Multiple titles on one page, each title can have multiple PDFs |
|
|
|
|
|
Args: |
|
|
page: Playwright page object |
|
|
config: Website configuration dict |
|
|
source: Source name |
|
|
start_date: Optional start date for filtering |
|
|
end_date: Optional end date for filtering |
|
|
use_page_title_for_pdfs: If True, use page title for PDFs (Approach 2 behavior) |
|
|
page_title: Pre-extracted page title (optional, will extract if not provided and use_page_title_for_pdfs is True) |
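|
Example (hypothetical usage; assumes an open Playwright page already showing a listing page and a WEBSITE_CONFIG entry named "some_source"): |
    docs = await extract_pdfs_from_current_page(page, WEBSITE_CONFIG["some_source"], "some_source", start_date="2024-01-01", end_date="2024-12-31") |
Each returned dict carries "url", "local_path", "size", "title", "source", "extracted_text", "file_type" and "date" keys. |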
|
|
""" |
|
|
try: |
|
|
|
|
|
is_nbs = source.lower() in ["nbs", "nbs somalia"] |
|
|
if is_nbs: |
|
|
return await _extract_nbs_pdfs_grouped_by_title(page, config, source, start_date, end_date) |
|
|
|
|
|
|
|
|
|
|
|
pdf_links = [] |
|
|
pdf_selector = config.get("pdf_links") |
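|
# config["pdf_links"] may be either a single CSS selector string or a list of selectors |
# (hypothetical examples: "a[href$='.pdf']" or ["a.download-link", "a[href$='.pdf']"]); |
# both shapes are handled below. |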
|
|
|
|
|
if isinstance(pdf_selector, list): |
|
|
for selector in pdf_selector: |
|
|
elements = await page.query_selector_all(selector) |
|
|
for element in elements: |
|
|
|
|
|
href = await element.get_attribute("href") |
|
|
if not href: |
|
|
href = await element.get_attribute("button-url") |
|
|
if href: |
|
|
absolute_url = convert_to_absolute_url(href, page.url) |
|
|
|
|
|
try: |
|
|
link_text = await element.text_content() |
|
|
pdf_name = link_text.strip() if link_text else "" |
|
|
except Exception as e: |
|
|
logger.debug(f"β οΈ Could not extract link text: {str(e)}") |
|
|
pdf_name = "" |
|
|
|
|
|
|
|
|
if not pdf_name: |
|
|
url_path = urlparse(absolute_url).path |
|
|
if url_path: |
|
|
pdf_name = unquote(os.path.basename(url_path)) |
|
|
|
|
|
if pdf_name.lower().endswith('.pdf'): |
|
|
pdf_name = pdf_name[:-4] |
|
|
|
|
|
pdf_links.append({ |
|
|
"url": absolute_url, |
|
|
"name": pdf_name, |
|
|
"file_type": "PDF" |
|
|
}) |
|
|
elif isinstance(pdf_selector, str): |
|
|
elements = await page.query_selector_all(pdf_selector) |
|
|
for element in elements: |
|
|
|
|
|
href = await element.get_attribute("href") |
|
|
if not href: |
|
|
href = await element.get_attribute("button-url") |
|
|
if href: |
|
|
absolute_url = convert_to_absolute_url(href, page.url) |
|
|
|
|
|
try: |
|
|
link_text = await element.text_content() |
|
|
pdf_name = link_text.strip() if link_text else "" |
|
|
except Exception as e: |
|
|
logger.debug(f"β οΈ Could not extract link text: {str(e)}") |
|
|
pdf_name = "" |
|
|
|
|
|
|
|
|
if not pdf_name: |
|
|
|
|
url_path = urlparse(absolute_url).path |
|
|
if url_path: |
|
|
pdf_name = unquote(os.path.basename(url_path)) |
|
|
|
|
|
if pdf_name.lower().endswith('.pdf'): |
|
|
pdf_name = pdf_name[:-4] |
|
|
|
|
|
pdf_links.append({ |
|
|
"url": absolute_url, |
|
|
"name": pdf_name, |
|
|
"file_type": "PDF" |
|
|
}) |
|
|
|
|
|
|
|
|
file_links = [] |
|
|
file_selector = config.get("file_links") |
|
|
|
|
|
if file_selector: |
|
|
|
|
|
file_type = "CSV" |
|
|
|
|
|
if isinstance(file_selector, list): |
|
|
for selector in file_selector: |
|
|
elements = await page.query_selector_all(selector) |
|
|
for element in elements: |
|
|
href = await element.get_attribute("href") |
|
|
if href: |
|
|
absolute_url = convert_to_absolute_url(href, page.url) |
|
|
|
|
|
if absolute_url.lower().endswith('.csv'): |
|
|
file_type = "CSV" |
|
|
elif absolute_url.lower().endswith(('.xlsx', '.xls')): |
|
|
file_type = "XLSX" |
|
|
elif absolute_url.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp')): |
|
|
file_type = "PNG" |
|
|
else: |
|
|
file_type = "CSV" |
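|
# Extension-to-type mapping above (examples): ".csv" -> CSV, ".xlsx"/".xls" -> XLSX, |
# any image extension (.png/.jpg/.jpeg/.gif/.webp) -> PNG, and anything else defaults to CSV. |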
|
|
|
|
|
|
|
|
try: |
|
|
link_text = await element.text_content() |
|
|
file_name = link_text.strip() if link_text else "" |
|
|
except Exception as e: |
|
|
logger.debug(f"β οΈ Could not extract link text: {str(e)}") |
|
|
file_name = "" |
|
|
|
|
|
|
|
|
if not file_name: |
|
|
url_path = urlparse(absolute_url).path |
|
|
if url_path: |
|
|
file_name = unquote(os.path.basename(url_path)) |
|
|
|
|
|
for ext in ['.csv', '.xlsx', '.xls', '.png', '.jpg', '.jpeg', '.gif', '.webp']: |
|
|
if file_name.lower().endswith(ext): |
|
|
file_name = file_name[:-len(ext)] |
|
|
break |
|
|
|
|
|
file_links.append({ |
|
|
"url": absolute_url, |
|
|
"name": file_name, |
|
|
"file_type": file_type |
|
|
}) |
|
|
elif isinstance(file_selector, str): |
|
|
elements = await page.query_selector_all(file_selector) |
|
|
for element in elements: |
|
|
href = await element.get_attribute("href") |
|
|
if href: |
|
|
absolute_url = convert_to_absolute_url(href, page.url) |
|
|
|
|
|
if absolute_url.lower().endswith('.csv'): |
|
|
file_type = "CSV" |
|
|
elif absolute_url.lower().endswith(('.xlsx', '.xls')): |
|
|
file_type = "XLSX" |
|
|
elif absolute_url.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.webp')): |
|
|
file_type = "PNG" |
|
|
else: |
|
|
file_type = "CSV" |
|
|
|
|
|
|
|
|
try: |
|
|
link_text = await element.text_content() |
|
|
file_name = link_text.strip() if link_text else "" |
|
|
except Exception as e: |
|
|
logger.debug(f"β οΈ Could not extract link text: {str(e)}") |
|
|
file_name = "" |
|
|
|
|
|
|
|
|
if not file_name: |
|
|
url_path = urlparse(absolute_url).path |
|
|
if url_path: |
|
|
file_name = unquote(os.path.basename(url_path)) |
|
|
|
|
|
for ext in ['.csv', '.xlsx', '.xls', '.png', '.jpg', '.jpeg', '.gif', '.webp']: |
|
|
if file_name.lower().endswith(ext): |
|
|
file_name = file_name[:-len(ext)] |
|
|
break |
|
|
|
|
|
file_links.append({ |
|
|
"url": absolute_url, |
|
|
"name": file_name, |
|
|
"file_type": file_type |
|
|
}) |
|
|
|
|
|
|
|
|
all_links = pdf_links + file_links |
|
|
|
|
|
logger.info(f"π Found {len(pdf_links)} PDF links and {len(file_links)} file links on current page (total: {len(all_links)})") |
|
|
|
|
|
|
|
|
csv_files = [link for link in file_links if link.get("file_type") == "CSV"] |
|
|
if csv_files: |
|
|
logger.info(f"π Found {len(csv_files)} CSV file(s) to process:") |
|
|
for csv_file in csv_files: |
|
|
logger.info(f" - CSV: {csv_file.get('name', 'Unknown')} at {csv_file.get('url', 'Unknown URL')}") |
|
|
|
|
|
|
|
|
if page_title is None: |
|
|
page_title = "" |
|
|
title_selector = config.get("title") |
|
|
if title_selector: |
|
|
try: |
|
|
title_element = await page.query_selector(title_selector) |
|
|
if title_element: |
|
|
page_title = await title_element.text_content() |
|
|
if page_title: |
|
|
page_title = page_title.strip() |
|
|
logger.info(f"π Extracted page title: {page_title}") |
|
|
else: |
|
|
logger.debug(f"β οΈ Title element found but no text content") |
|
|
else: |
|
|
logger.debug(f"β οΈ Title element not found with selector: {title_selector}") |
|
|
except Exception as e: |
|
|
logger.warning(f"β οΈ Error extracting page title: {str(e)}") |
|
|
elif page_title: |
|
|
logger.info(f"π Using provided page title: {page_title}") |
|
|
|
|
|
|
|
|
date_selector = config.get("date") |
|
|
date_elements = [] |
|
|
if date_selector: |
|
|
try: |
|
|
date_elements = await page.query_selector_all(date_selector) |
|
|
logger.debug(f"π Found {len(date_elements)} date elements on current page") |
|
|
except Exception as e: |
|
|
logger.debug(f"β οΈ Could not extract date elements: {str(e)}") |
|
|
|
|
|
|
|
|
downloaded_pdfs = [] |
|
|
for i, file_info in enumerate(all_links): |
|
|
if scraping_cancelled(): |
|
|
logger.info("π Scraping cancelled, stopping file downloads") |
|
|
break |
|
|
|
|
|
|
|
|
if is_pdf_limit_reached(): |
|
|
logger.info(f"π Global PDF limit reached ({MAX_PDF_LIMIT}), stopping file processing") |
|
|
break |
|
|
|
|
|
file_url = file_info["url"] |
|
|
file_name = file_info.get("name", "") |
|
|
file_type = file_info.get("file_type", "PDF") |
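|
# Naming priority applied below: page title when use_page_title_for_pdfs is set (Approach 2), |
# otherwise the file link text, then the page title as a fallback, and finally a generated |
# "<file_type> <global count>" placeholder name. |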
|
|
|
|
|
|
|
|
if use_page_title_for_pdfs and page_title: |
|
|
|
|
|
file_name = page_title |
|
|
logger.info(f"π Using page title for {file_type} (Approach 2): {file_name}") |
|
|
elif file_name and file_name != "": |
|
|
|
|
|
|
|
|
file_name = " ".join(file_name.split()) |
|
|
logger.info(f"π Using {file_type} link text as name: {file_name}") |
|
|
elif page_title: |
|
|
|
|
|
file_name = page_title |
|
|
logger.info(f"π Using page title as fallback for {file_type}: {file_name}") |
|
|
else: |
|
|
|
|
|
current_count = get_global_pdf_count() + 1 |
|
|
file_name = f"{file_type} {current_count}" |
|
|
logger.info(f"π Using fallback name: {file_name}") |
|
|
|
|
|
logger.info(f"β¬οΈ Downloading {file_type} {i+1}/{len(all_links)}: {file_url}") |
|
|
logger.info(f"π {file_type} name: {file_name}") |
|
|
logger.info(f"π Global PDF count: {get_global_pdf_count()}/{MAX_PDF_LIMIT}") |
|
|
|
|
|
try: |
|
|
|
|
|
if file_type == "PDF": |
|
|
download_result = download_and_save_pdf(file_url, source) |
|
|
else: |
|
|
|
|
|
download_result = download_and_save_file(file_url, source, file_type.lower()) |
|
|
|
|
|
if download_result["success"]: |
|
|
local_file_path = download_result["path"] |
|
|
extracted_text = "" |
|
|
|
|
|
|
|
|
if file_type == "PDF": |
|
|
logger.info(f"π Extracting text from local file: {local_file_path}") |
|
|
extracted_text = extract_text_from_pdf_file(local_file_path) |
|
|
logger.info(f"π Extracted text length: {len(extracted_text)} characters") |
|
|
if not extracted_text: |
|
|
logger.warning("β οΈ No text extracted from PDF") |
|
|
elif file_type == "CSV": |
|
|
|
|
|
try: |
|
|
import csv |
|
|
logger.info(f"π Reading CSV file preview: {local_file_path}") |
|
|
with open(local_file_path, 'r', encoding='utf-8', errors='ignore') as csv_file: |
|
|
csv_reader = csv.reader(csv_file) |
|
|
|
|
|
preview_rows = [] |
|
|
for idx, row in enumerate(csv_reader): |
|
|
if idx >= 10: |
|
|
break |
|
|
preview_rows.append(row) |
|
|
|
|
|
|
|
|
if preview_rows: |
|
|
|
|
|
headers = preview_rows[0] if len(preview_rows) > 0 else [] |
|
|
data_rows = preview_rows[1:] if len(preview_rows) > 1 else [] |
|
|
|
|
|
|
|
|
location_info = "" |
|
|
if source == "icpac_seasonal_forecast" and file_name: |
|
|
location_info = f"Location: {file_name}\n" |
|
|
|
|
|
|
|
|
preview_text = f"CSV File: {file_name}\n" |
|
|
if location_info: |
|
|
preview_text += location_info |
|
|
preview_text += f"File Path: {local_file_path}\n" |
|
|
preview_text += f"Total Rows Previewed: {len(preview_rows)}\n\n" |
|
|
|
|
|
if headers: |
|
|
preview_text += "Headers: " + ", ".join(str(h) for h in headers) + "\n\n" |
|
|
|
|
|
if data_rows: |
|
|
preview_text += "Sample Data (first few rows):\n" |
|
|
for row in data_rows[:5]: |
|
|
preview_text += ", ".join(str(cell) for cell in row) + "\n" |
|
|
|
|
|
extracted_text = preview_text |
|
|
logger.info(f"π CSV preview extracted: {len(extracted_text)} characters") |
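|
# Illustrative preview_text layout (hypothetical CSV contents): |
#   CSV File: Rainfall Outlook |
#   File Path: archive/<source>/<date>/<hash>.csv |
#   Total Rows Previewed: 6 |
# |
#   Headers: station, month, rainfall_mm |
# |
#   Sample Data (first few rows): |
#   Mogadishu, Jan, 3.2 |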
|
|
else: |
|
|
location_info = "" |
|
|
if source == "icpac_seasonal_forecast" and file_name: |
|
|
location_info = f"Location: {file_name}\n" |
|
|
extracted_text = f"CSV File: {file_name}\n" |
|
|
if location_info: |
|
|
extracted_text += location_info |
|
|
extracted_text += f"File Path: {local_file_path}\n(File is empty or could not be read)" |
|
|
logger.warning("β οΈ CSV file appears to be empty") |
|
|
except Exception as e: |
|
|
logger.warning(f"β οΈ Could not read CSV preview: {str(e)}") |
|
|
location_info = "" |
|
|
if source == "icpac_seasonal_forecast" and file_name: |
|
|
location_info = f"Location: {file_name}\n" |
|
|
extracted_text = f"CSV File: {file_name}\n" |
|
|
if location_info: |
|
|
extracted_text += location_info |
|
|
extracted_text += f"File Path: {local_file_path}\n(Preview could not be generated: {str(e)})" |
|
|
elif file_type == "PNG": |
|
|
|
|
|
location_info = "" |
|
|
if source == "icpac_seasonal_forecast" and file_name: |
|
|
location_info = f"Location: {file_name}\n" |
|
|
|
|
|
extracted_text = f"PNG File: {file_name}\n" |
|
|
if location_info: |
|
|
extracted_text += location_info |
|
|
extracted_text += f"File Path: {local_file_path}\n" |
|
|
extracted_text += "(PNG image file downloaded successfully)" |
|
|
logger.info(f"π PNG file info extracted: {file_name}") |
|
|
else: |
|
|
|
|
|
logger.info(f"π {file_type} file downloaded (no text extraction needed)") |
|
|
extracted_text = f"{file_type} File: {file_name}\nFile Path: {local_file_path}" |
|
|
|
|
|
|
|
|
file_date_raw = "" |
|
|
if source == "mopnd": |
|
|
|
|
|
current_page_url = page.url |
|
|
|
|
|
if current_page_url in mopnd_article_dates: |
|
|
file_date_raw = mopnd_article_dates[current_page_url] |
|
|
logger.debug(f"β Using MOPND date from cache (page URL: {current_page_url}): {file_date_raw}") |
|
|
else: |
|
|
|
|
|
page_url_parsed = urlparse(current_page_url) |
|
|
page_url_normalized = urlunparse((page_url_parsed.scheme, page_url_parsed.netloc, page_url_parsed.path, '', '', '')) |
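|
# e.g. a hypothetical "https://mopnd.example/news/report?page=2#top" normalizes to |
# "https://mopnd.example/news/report" (params, query and fragment dropped), so the |
# cache lookup below tolerates differing query strings. |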
|
|
|
|
|
|
|
|
matching_url = None |
|
|
for cached_url in mopnd_article_dates.keys(): |
|
|
cached_parsed = urlparse(cached_url) |
|
|
cached_normalized = urlunparse((cached_parsed.scheme, cached_parsed.netloc, cached_parsed.path, '', '', '')) |
|
|
if cached_normalized == page_url_normalized: |
|
|
matching_url = cached_url |
|
|
break |
|
|
|
|
|
if matching_url: |
|
|
file_date_raw = mopnd_article_dates[matching_url] |
|
|
logger.debug(f"β Using MOPND date from cache (matched normalized URL): {file_date_raw}") |
|
|
else: |
|
|
logger.warning(f"β οΈ MOPND date not found in cache for page URL: {current_page_url}") |
|
|
logger.debug(f"π Available page URLs in cache: {list(mopnd_article_dates.keys())[:3]}") |
|
|
elif i < len(date_elements): |
|
|
try: |
|
|
date_text = await date_elements[i].text_content() |
|
|
if date_text: |
|
|
file_date_raw = date_text.strip() |
|
|
logger.debug(f"β Extracted raw date from listing page: {file_date_raw}") |
|
|
except Exception as e: |
|
|
logger.debug(f"β οΈ Could not extract date for {file_type} {i+1}: {str(e)}") |
|
|
|
|
|
|
|
|
file_date = standardize_date(file_date_raw, default_to_current=True) |
|
|
if not file_date: |
|
|
file_date = datetime.now().strftime("%Y-%m-%d") |
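|
# At this point file_date is an ISO "YYYY-MM-DD" string: either the standardized raw date |
# (assuming standardize_date emits the same ISO format as the fallback) or today's date |
# when nothing usable was found. |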
|
|
|
|
|
|
|
|
if start_date or end_date: |
|
|
start_dt = parse_date_input(start_date) if start_date else None |
|
|
end_dt = parse_date_input(end_date) if end_date else None |
|
|
if not is_date_in_range(file_date, start_dt, end_dt, include_missing=False): |
|
|
logger.info(f"π {file_type} date {file_date} is outside date range [{start_date}, {end_date}] - filtering out") |
|
|
continue |
|
|
|
|
|
|
|
|
current_count = increment_global_pdf_count() |
|
|
|
|
|
downloaded_pdfs.append({ |
|
|
"url": file_url, |
|
|
"local_path": local_file_path, |
|
|
"size": download_result["size"], |
|
|
"title": file_name, |
|
|
"source": source, |
|
|
"extracted_text": extracted_text, |
|
|
"file_type": file_type, |
|
|
"date": file_date |
|
|
}) |
|
|
logger.info(f"β Successfully downloaded and processed {file_type} '{file_name}' (Global: {current_count}/{MAX_PDF_LIMIT})") |
|
|
else: |
|
|
logger.warning(f"β Failed to download {file_type} {i+1}: {download_result.get('message', 'Unknown error')}") |
|
|
except Exception as e: |
|
|
logger.error(f"β Error downloading {file_type} {i+1}: {str(e)}") |
|
|
continue |
|
|
|
|
|
return downloaded_pdfs |
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"β Error extracting PDFs from current page: {str(e)}") |
|
|
return [] |
|
|
|
|
|
async def extract_document_content_unified(page, document_url: str, config: dict, website_type: str = None, pdf_count: int = 0, start_date: str = None, end_date: str = None) -> dict: |
|
|
""" |
|
|
Unified function to extract content from a single document (PDF-focused) |
|
|
With 5 retry attempts for loading documents |
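|
Example (hypothetical usage; assumes a Playwright page and a WEBSITE_CONFIG entry named "some_source"): |
    result = await extract_document_content_unified(page, "https://example.org/some-doc", WEBSITE_CONFIG["some_source"], website_type="some_source", start_date="2024-01-01") |
May return None when the document date falls outside the requested range, or (for PDF-based sites) when no PDF text could be extracted. |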
|
|
""" |
|
|
try: |
|
|
|
|
|
max_retries = 5 |
|
|
retry_count = 0 |
|
|
page_loaded = False |
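|
# Retry strategy used below: attempt 1 waits for "domcontentloaded" (30s timeout), |
# attempt 2 uses Playwright's default "load" wait (20s), attempt 3 waits for "networkidle" (15s), |
# and attempts 4-5 fall back to the default wait with a 10s timeout. |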
|
|
|
|
|
while retry_count < max_retries and not page_loaded: |
|
|
try: |
|
|
retry_count += 1 |
|
|
logger.info(f"π Loading document (attempt {retry_count}/{max_retries}): {document_url}") |
|
|
|
|
|
|
|
|
if retry_count == 1: |
|
|
|
|
|
await page.goto(document_url, wait_until="domcontentloaded", timeout=30000) |
|
|
elif retry_count == 2: |
|
|
|
|
|
await page.goto(document_url, timeout=20000) |
|
|
elif retry_count == 3: |
|
|
|
|
|
await page.goto(document_url, wait_until="networkidle", timeout=15000) |
|
|
else: |
|
|
|
|
|
await page.goto(document_url, timeout=10000) |
|
|
|
|
|
logger.info(f"β Successfully loaded document on attempt {retry_count}") |
|
|
page_loaded = True |
|
|
|
|
|
except Exception as e: |
|
|
logger.warning(f"β οΈ Attempt {retry_count} failed for {document_url}: {str(e)}") |
|
|
|
|
|
if retry_count >= max_retries: |
|
|
logger.error(f"β Failed to load document after {max_retries} attempts: {document_url}") |
|
|
return { |
|
|
"title": "Network Error", |
|
|
"content": f"Failed to access document after {max_retries} attempts: {str(e)}", |
|
|
"date": datetime.now().strftime("%Y-%m-%d"), |
|
|
"url": document_url |
|
|
} |
|
|
|
|
|
|
|
|
await asyncio.sleep(2) |
|
|
|
|
|
if not page_loaded: |
|
|
return { |
|
|
"title": "Network Error", |
|
|
"content": f"Failed to access document after {max_retries} attempts", |
|
|
"date": datetime.now().strftime("%Y-%m-%d"), |
|
|
"url": document_url |
|
|
} |
|
|
|
|
|
|
|
|
title = "" |
|
|
title_extracted_from_page = False |
|
|
|
|
|
|
|
|
if website_type == "mopnd" and document_url in mopnd_article_titles: |
|
|
title = mopnd_article_titles[document_url] |
|
|
title_extracted_from_page = True |
|
|
logger.debug(f"β Using MOPND title from main page: {title}") |
|
|
elif website_type == "mopnd": |
|
|
logger.warning(f"β οΈ MOPND title not found in cache for URL: {document_url}") |
|
|
logger.debug(f"π Available titles: {list(mopnd_article_titles.keys())[:3]}") |
|
|
else: |
|
|
|
|
|
title_selector = config.get("title") |
|
|
if title_selector: |
|
|
try: |
|
|
title_element = await page.query_selector(title_selector) |
|
|
if title_element: |
|
|
title = await title_element.text_content() |
|
|
if title: |
|
|
title = title.strip() |
|
|
title_extracted_from_page = True |
|
|
logger.info(f"β Extracted title from page using selector '{title_selector}': {title}") |
|
|
else: |
|
|
logger.debug(f"β οΈ Title element found but no text content with selector: {title_selector}") |
|
|
else: |
|
|
logger.debug(f"β οΈ Title element not found with selector: {title_selector}") |
|
|
except Exception as e: |
|
|
logger.warning(f"Error extracting title with selector '{title_selector}': {str(e)}") |
|
|
else: |
|
|
logger.warning("β οΈ No title selector found in config") |
|
|
|
|
|
|
|
|
if website_type is None: |
|
|
for site_type, site_config in WEBSITE_CONFIG.items(): |
|
|
if site_config == config: |
|
|
website_type = site_type |
|
|
break |
|
|
if website_type is None: |
|
|
website_type = "unknown" |
|
|
|
|
|
content = "" |
|
|
pdf_path = "" |
|
|
|
|
|
|
|
|
|
|
|
pdf_websites = get_pdf_websites() |
|
|
if website_type in pdf_websites: |
|
|
pdf_links = [] |
|
|
try: |
|
|
|
|
|
pdf_links_selector = config.get("pdf_links") |
|
|
|
|
|
|
|
|
pdf_elements = [] |
|
|
|
|
|
|
|
|
if isinstance(pdf_links_selector, list): |
|
|
|
|
|
logger.info(f"π Processing array of {len(pdf_links_selector)} PDF selectors") |
|
|
for selector in pdf_links_selector: |
|
|
try: |
|
|
elements = await page.query_selector_all(selector) |
|
|
logger.info(f"π Found {len(elements)} elements with selector {selector}") |
|
|
pdf_elements.extend(elements) |
|
|
except Exception as e: |
|
|
logger.warning(f"β Error with selector '{selector}': {str(e)}") |
|
|
elif isinstance(pdf_links_selector, str): |
|
|
|
|
|
logger.info(f"π Using string selector: {pdf_links_selector}") |
|
|
pdf_elements = await page.query_selector_all(pdf_links_selector) |
|
|
else: |
|
|
logger.warning("β οΈ No pdf_links selector in config, skipping PDF extraction") |
|
|
|
|
|
|
|
|
logger.debug(f"π Processing {len(pdf_elements)} PDF elements for {website_type}") |
|
|
for i, element in enumerate(pdf_elements): |
|
|
try: |
|
|
logger.debug(f"π Extracting PDF URL from element {i+1}/{len(pdf_elements)}") |
|
|
|
|
|
|
|
|
href = await element.get_attribute("href") |
|
|
if not href: |
|
|
href = await element.get_attribute("button-url") |
|
|
if href: |
|
|
|
|
|
absolute_url = convert_to_absolute_url(href, page.url) |
|
|
|
|
|
|
|
|
try: |
|
|
link_text = await element.text_content() |
|
|
pdf_name = link_text.strip() if link_text else "" |
|
|
except Exception as e: |
|
|
logger.debug(f"β οΈ Could not extract link text: {str(e)}") |
|
|
pdf_name = "" |
|
|
|
|
|
|
|
|
if not pdf_name: |
|
|
|
|
url_path = urlparse(absolute_url).path |
|
|
if url_path: |
|
|
pdf_name = unquote(os.path.basename(url_path)) |
|
|
|
|
|
if pdf_name.lower().endswith('.pdf'): |
|
|
pdf_name = pdf_name[:-4] |
|
|
|
|
|
pdf_links.append({ |
|
|
"url": absolute_url, |
|
|
"name": pdf_name |
|
|
}) |
|
|
logger.info(f"π Found PDF URL: {absolute_url}") |
|
|
if pdf_name: |
|
|
logger.info(f"π PDF name: {pdf_name}") |
|
|
else: |
|
|
logger.debug(f"β οΈ No href or button-url attribute found on element {i+1}") |
|
|
|
|
|
except Exception as e: |
|
|
logger.warning(f"β Error extracting PDF URL from element {i+1}: {str(e)}") |
|
|
continue |
|
|
except Exception as e: |
|
|
logger.warning(f"Error extracting PDF links: {str(e)}") |
|
|
pdf_links = [] |
|
|
|
|
|
if pdf_links: |
|
|
logger.info(f"π Found {len(pdf_links)} PDF links, processing...") |
|
|
|
|
|
pdf_content_parts = [] |
|
|
for i, pdf_info in enumerate(pdf_links): |
|
|
if MAX_PDF_LIMIT is not None and pdf_count >= MAX_PDF_LIMIT:  # check for None before comparing to avoid a TypeError |
|
|
logger.info(f"π Reached PDF limit ({MAX_PDF_LIMIT}), stopping PDF processing") |
|
|
break |
|
|
|
|
|
|
|
|
if isinstance(pdf_info, dict): |
|
|
pdf_url = pdf_info["url"] |
|
|
pdf_name = pdf_info.get("name", "") |
|
|
else: |
|
|
|
|
|
pdf_url = pdf_info |
|
|
pdf_name = "" |
|
|
|
|
|
try: |
|
|
logger.info(f"π Processing PDF {i+1}/{len(pdf_links)}: {pdf_url}") |
|
|
if pdf_name: |
|
|
logger.info(f"π PDF name: {pdf_name}") |
|
|
|
|
|
|
|
|
download_result = download_and_save_pdf(pdf_url, website_type) |
|
|
if download_result["success"]: |
|
|
|
|
|
pdf_path = download_result["path"] |
|
|
logger.info(f"π PDF downloaded to: {pdf_path}") |
|
|
|
|
|
|
|
|
pdf_content = extract_text_from_pdf_file(pdf_path) |
|
|
|
|
|
if pdf_content and len(pdf_content.strip()) > 10: |
|
|
|
|
|
pdf_label = pdf_name if pdf_name else f"PDF {i+1}" |
|
|
pdf_content_parts.append(f"{pdf_label} Content:\n{pdf_content}") |
|
|
logger.info(f"β Extracted {len(pdf_content)} characters from {pdf_label}") |
|
|
|
|
|
|
|
|
|
|
|
if pdf_name and not title_extracted_from_page and not title: |
|
|
title = pdf_name |
|
|
logger.info(f"π Using PDF name as title (page title extraction failed): {title}") |
|
|
else: |
|
|
logger.warning(f"β οΈ No content extracted from PDF {i+1}") |
|
|
else: |
|
|
logger.warning(f"β Failed to download PDF {i+1}: {download_result.get('message', 'Unknown error')}") |
|
|
|
|
|
pdf_count += 1 |
|
|
logger.info(f"π PDF {pdf_count}/{MAX_PDF_LIMIT} processed") |
|
|
|
|
|
except Exception as e: |
|
|
logger.warning(f"β Error processing PDF {i+1}: {str(e)}") |
|
|
continue |
|
|
|
|
|
|
|
|
if pdf_content_parts: |
|
|
content = "\n\n".join(pdf_content_parts) |
|
|
logger.info(f"π Combined PDF content: {len(content)} characters total") |
|
|
|
|
|
|
|
|
|
|
|
if not title_extracted_from_page and not title and content and len(content) > 50: |
|
|
lines = content.split('\n')[:5] |
|
|
for line in lines: |
|
|
if line.strip() and len(line.strip()) > 10 and len(line.strip()) < 100: |
|
|
title = line.strip() |
|
|
logger.info(f"π Using title extracted from PDF content (page title extraction failed): {title}") |
|
|
break |
|
|
else: |
|
|
logger.warning("β οΈ No PDF content extracted, skipping document") |
|
|
content = "" |
|
|
else: |
|
|
|
|
|
logger.info("π No PDF links found, skipping document") |
|
|
content = "" |
|
|
|
|
|
|
|
|
date_raw = "" |
|
|
|
|
|
|
|
|
if website_type == "mopnd" and document_url in mopnd_article_dates: |
|
|
date_raw = mopnd_article_dates[document_url] |
|
|
logger.debug(f"β Using MOPND date from main page: {date_raw}") |
|
|
elif website_type == "mopnd": |
|
|
logger.warning(f"β οΈ MOPND date not found in cache for URL: {document_url}") |
|
|
logger.debug(f"π Available dates: {list(mopnd_article_dates.keys())[:3]}") |
|
|
else: |
|
|
|
|
|
date_selector = config.get("date") |
|
|
|
|
|
if date_selector: |
|
|
try: |
|
|
date_element = await page.query_selector(date_selector) |
|
|
if date_element: |
|
|
date_raw = await date_element.text_content() |
|
|
if date_raw: |
|
|
date_raw = date_raw.strip() |
|
|
logger.debug(f"β Extracted raw date: {date_raw}") |
|
|
except Exception as e: |
|
|
logger.warning(f"Error extracting date with selector {date_selector}: {str(e)}") |
|
|
|
|
|
|
|
|
date = standardize_date(date_raw, default_to_current=True) |
|
|
if not date: |
|
|
date = datetime.now().strftime("%Y-%m-%d") |
|
|
logger.info(f"No date found with config selector, using current date: {date}") |
|
|
|
|
|
|
|
|
if start_date or end_date: |
|
|
start_dt = parse_date_input(start_date) if start_date else None |
|
|
end_dt = parse_date_input(end_date) if end_date else None |
|
|
if not is_date_in_range(date, start_dt, end_dt, include_missing=False): |
|
|
logger.info(f"π Document date {date} is outside date range [{start_date}, {end_date}] - filtering out") |
|
|
return None |
|
|
|
|
|
|
|
|
|
|
|
pdf_websites = get_pdf_websites() |
|
|
if website_type in pdf_websites: |
|
|
if not content or len(content.strip()) < 10: |
|
|
logger.info(f"π Skipping document with no PDF content: {document_url}") |
|
|
return None |
|
|
|
|
|
result = { |
|
|
"title": title or "No title found", |
|
|
"content": content or "No content found", |
|
|
"date": date, |
|
|
"url": document_url |
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
pdf_websites = get_pdf_websites() |
|
|
if website_type in pdf_websites: |
|
|
if pdf_path: |
|
|
result["pdf_path"] = pdf_path |
|
|
logger.info(f"π Added PDF path to result: {pdf_path}") |
|
|
else: |
|
|
logger.warning("β οΈ No PDF path available for PDF-based site") |
|
|
|
|
|
return result |
|
|
|
|
|
except Exception as e: |
|
|
logger.error(f"Error extracting content from {document_url}: {str(e)}") |
|
|
return { |
|
|
"title": "Error", |
|
|
"content": f"Error extracting content: {str(e)}", |
|
|
"date": datetime.now().strftime("%Y-%m-%d"), |
|
|
"url": document_url |
|
|
} |
|
|
|