Spaces:

vanderbilt-dsi
/

survey-analytics

Sleeping

App Files Files Community

umangchaudhry committed on Nov 7

Commit

cc2626e

verified ·

1 Parent(s): fc421eb

Upload 20 files

Browse files

Files changed (14) hide show

app.py +98 -305
config.py +130 -0
crosstab_rag.py +418 -584
prompts/crosstab_rag_prompt_system.txt +11 -5
prompts/relevance_check_prompt.txt +93 -0
prompts/research_brief_prompt.txt +177 -8
prompts/synthesis_prompt_system.txt +12 -3
prompts/synthesis_prompt_user.txt +16 -4
questionnaire_rag.py +113 -483
relevance_checker.py +248 -0
survey_agent.py +0 -0
toplines_rag.py +131 -160
toplines_vectorstores/poll_catalog_toplines.json +43 -3
toplines_vectorstores/toplines_index.json +0 -0

app.py CHANGED Viewed

@@ -1,336 +1,129 @@
 """
-Gradio interface for Survey Analysis Agent
-Host on Hugging Face Spaces
 """
 import os
-import gradio as gr
 from survey_agent import SurveyAnalysisAgent
-import uuid
-from datetime import datetime
-# Initialize agent (will be done once at startup)
-agent = None
-initialization_error = None
-def initialize_agent():
-    """Initialize the agent with API keys from environment"""
-    global agent, initialization_error
-    try:
-        openai_api_key = os.getenv("OPENAI_API_KEY")
-        pinecone_api_key = os.getenv("PINECONE_API_KEY")
-        if not openai_api_key:
-            initialization_error = "❌ OPENAI_API_KEY not found. Please set it in Space Settings → Repository Secrets."
-            return False
-        if not pinecone_api_key:
-            initialization_error = "❌ PINECONE_API_KEY not found. Please set it in Space Settings → Repository Secrets."
-            return False
-        # Check if vector store exists
-        if not os.path.exists("./questionnaire_vectorstores"):
-            initialization_error = "❌ Vector store directory not found. Please upload the questionnaire_vectorstores folder."
-            return False
-        agent = SurveyAnalysisAgent(
-            openai_api_key=openai_api_key,
-            pinecone_api_key=pinecone_api_key,
-            verbose=False  # Set to False for cleaner UI
-        )
-        return True
-    except Exception as e:
-        initialization_error = f"❌ Initialization error: {str(e)}"
-        return False
-def chat_with_streaming(message, history, session_id):
-    """
-    Stream response for better UX
-    Args:
-        message: User's message
-        history: Chat history (not used in streaming)
-        session_id: Unique session identifier for conversation memory
-    Yields:
-        Partial responses as they become available
-    """
-    if initialization_error:
-        yield initialization_error
-        return
-    if not agent:
-        yield "⚠️ Agent not initialized. Please refresh the page."
-        return
-    if not message.strip():
-        return
-    try:
-        # Show that we're processing with a distinctive format
-        yield "⏳ **Processing your request...**\n\n🤔 Analyzing your question..."
-        # Debug: Check if stream_query exists
-        if not hasattr(agent, 'stream_query'):
-            print("⚠️ WARNING: agent.stream_query() not found, falling back to regular query")
-            yield agent.query(message, thread_id=session_id)
-            return
-        # Define the workflow stages
-        stages = {
-            "generate_research_brief": {"icon": "📋", "text": "Planning research strategy", "step": 1},
-            "execute_stage": {"icon": "📊", "text": "Retrieving data from surveys", "step": 2},
-            "extract_stage_context": {"icon": "🔗", "text": "Processing retrieved data", "step": 3},
-            "synthesize_response": {"icon": "✍️", "text": "Synthesizing answer", "step": 4}
-        }
-        total_steps = 4
-        # Stream events from agent
-        has_answer = False
-        event_count = 0
-        current_step = 0
-        for event in agent.stream_query(message, thread_id=session_id):
-            event_count += 1
-            print(f"📡 Stream event {event_count}: {list(event.keys()) if event else 'None'}")
-            if not event:
-                continue
-            # Get current node
-            node = list(event.keys())[0]
-            print(f"   Processing node: {node}")
-            # Build progress display
-            if node in stages:
-                stage_info = stages[node]
-                current_step = stage_info['step']
-                # Calculate percentage
-                percentage = int((current_step / total_steps) * 100)
-                # Create a clean progress indicator
-                progress_display = f"### ⏳ Processing your request... ({percentage}%)\n\n"
-                progress_display += f"> **Current step:** {stage_info['icon']} {stage_info['text']}\n\n"
-                yield progress_display
-            # Check for final answer
-            if node == "synthesize_response":
-                # Get final answer
-                state = event[node]
-                final_answer = state.get("final_answer")
-                if final_answer:
-                    print(f"   Got final answer ({len(final_answer)} chars)")
-                    yield final_answer
-                    has_answer = True
-                    return
-        print(f"📡 Stream complete. Total events: {event_count}, Has answer: {has_answer}")
-        # Fallback if streaming didn't provide answer
-        if not has_answer:
-            print("⚠️ No answer from streaming, using regular query")
-            yield agent.query(message, thread_id=session_id)
-    except Exception as e:
-        error_msg = f"❌ Error processing query: {str(e)}"
-        print(f"Error details: {e}")
-        import traceback
-        traceback.print_exc()
-        yield error_msg
-def create_new_session():
-    """Create a new session ID"""
-    return str(uuid.uuid4())
-def get_available_surveys():
-    """Get list of available surveys"""
-    if initialization_error or not agent:
-        return "Agent not initialized"
     try:
-        surveys = agent.questionnaire_rag.get_available_survey_names()
-        polls = agent.questionnaire_rag.get_available_polls()
-        info = "### Available Surveys\n\n"
-        info += f"**{', '.join(surveys)}**\n\n"
-        info += "### Available Time Periods\n\n"
-        # Group by year
-        by_year = {}
-        for poll in polls:
-            year = poll['year']
-            if year not in by_year:
-                by_year[year] = []
-            by_year[year].append(poll)
-        for year in sorted(by_year.keys(), reverse=True):
-            info += f"**{year}:**\n"
-            for poll in sorted(by_year[year], key=lambda x: x['month']):
-                info += f"- {poll['month']} ({poll['num_questions']} questions)\n"
-            info += "\n"
-        return info
     except Exception as e:
-        return f"Error retrieving survey info: {str(e)}"
-# Initialize agent at startup
-print("🚀 Initializing Survey Analysis Agent...")
-init_success = initialize_agent()
-if init_success:
-    print("✅ Agent initialized successfully!")
-else:
-    print(f"⚠️ Agent initialization failed: {initialization_error}")
-# Create Gradio interface with modern chat-first design
-with gr.Blocks(title="Survey Analysis Agent", theme=gr.themes.Soft()) as demo:
-    # Session state
-    session_id_state = gr.State(value=create_new_session())
-    # Main layout: chat takes priority
-    with gr.Row():
-        with gr.Column(scale=3):
-            # Header
-            gr.Markdown("""
-            # 📊 Survey Analysis Agent
-            Ask questions about Vanderbilt Unity Poll data using natural language.
-            I can analyze questions, response frequencies, and demographic breakdowns across multiple time periods.
-            """)
-            # Show initialization status if there's an error
-            if initialization_error:
-                gr.Markdown(f"## ⚠️ Setup Required\n\n{initialization_error}")
-            # Main chat interface
-            chatbot = gr.Chatbot(
-                label="",
-                height=500,
-                show_label=False,
-                type="messages",
-                placeholder="Ask me anything about the survey data..."
-            )
-            with gr.Row():
-                msg = gr.Textbox(
-                    label="",
-                    placeholder="e.g., What questions about the economy were asked in June 2025?",
-                    show_label=False,
-                    scale=9,
-                    container=False
-                )
-                submit = gr.Button("Send", scale=1, variant="primary")
-            with gr.Row():
-                clear = gr.Button("🔄 New Conversation", size="sm")
-            # Example questions
-            gr.Examples(
-                examples=[
-                    "What questions were asked in June 2025?",
-                    "Show me Trump's approval ratings in 2025",
-                    "What questions about the economy were asked in 2025?",
-                    "How do responses about immigration vary by political party?",
-                    "Compare healthcare questions from February and June 2025",
-                ],
-                inputs=msg,
-                label="💡 Example Questions"
-            )
-        # Collapsible sidebar with info
-        with gr.Column(scale=1):
-            with gr.Accordion("📋 Available Data", open=False):
-                survey_info = gr.Markdown(
-                    value=get_available_surveys() if init_success else "Agent not initialized",
-                )
-                refresh_info = gr.Button("🔄 Refresh", size="sm")
-            with gr.Accordion("🎯 What I Can Do", open=False):
-                gr.Markdown("""
-                **📝 Questionnaires**
-                - Question text & options
-                - Topics and themes
-                - Skip logic & sampling
-                - Question sequencing
-                **📊 Response Data**
-                - Overall percentages
-                - Demographic breakdowns
-                - Cross-tabulations
-                - Time comparisons
-                """)
-            with gr.Accordion("💡 Tips", open=False):
-                gr.Markdown("""
-                - Specify time periods when relevant
-                - Ask follow-up questions for more detail
-                - I maintain conversation context
-                - Request comparisons across time periods
-                """)
-            with gr.Accordion("🔧 Current Status", open=False):
-                gr.Markdown("""
-                ✅ Questionnaire data
-                ✅ Toplines (response %)
-                ✅ Crosstabs (demographics)
-                ⏳ SQL queries (coming soon)
-                """)
-    # Footer
-    gr.Markdown("""
-    ---
-    💬 **Conversation Memory:** I remember our conversation history, so feel free to ask follow-up questions
-    or reference previous queries (e.g., "Show me the crosstabs for those questions").
-    """)
-    # Event handlers
-    def respond(message, chat_history, session_id):
-        """Handle message with streaming updates"""
-        if not message.strip():
-            return chat_history, ""
-        # Add user message
-        chat_history.append({"role": "user", "content": message})
-        # Add placeholder for assistant response
-        chat_history.append({"role": "assistant", "content": ""})
-        # Stream updates
-        for partial_response in chat_with_streaming(message, chat_history, session_id):
-            chat_history[-1]["content"] = partial_response
-            yield chat_history, ""
-        # Final return
-        yield chat_history, ""
-    def clear_chat():
-        """Clear chat and create new session"""
-        new_session = create_new_session()
-        return [], new_session
-    # Wire up events
-    msg.submit(respond, [msg, chatbot, session_id_state], [chatbot, msg])
-    submit.click(respond, [msg, chatbot, session_id_state], [chatbot, msg])
-    clear.click(clear_chat, None, [chatbot, session_id_state])
-    refresh_info.click(get_available_surveys, None, survey_info)
-# Launch the app
 if __name__ == "__main__":
     demo.launch(
-        server_name="0.0.0.0",
         server_port=7860,
-        share=False
     )

 """
+Gradio ChatInterface for Survey Agent V2 - Simplified Version
+Uses ChatInterface to avoid API generation bugs
 """
 import os
+import sys
+from pathlib import Path
+# Add parent directory to path for imports
+sys.path.append(str(Path(__file__).parent))
 from survey_agent import SurveyAnalysisAgent
+try:
+    from dotenv import load_dotenv
+    load_dotenv()
+except ImportError:
+    pass
+import gradio as gr
+# Global agent
+agent = None
+def initialize_agent():
+    """Create the shared SurveyAnalysisAgent on first use and return it.
+    The instance is cached in the module-level ``agent`` global, so later
+    calls return the existing object without re-reading the environment.
+    Returns:
+        The initialized SurveyAnalysisAgent.
+    Raises:
+        ValueError: If OPENAI_API_KEY or PINECONE_API_KEY is unset or empty;
+            the message names each missing variable.
+    """
+    global agent
+    if agent is not None:
+        return agent
+    required = {
+        "OPENAI_API_KEY": os.getenv("OPENAI_API_KEY"),
+        "PINECONE_API_KEY": os.getenv("PINECONE_API_KEY"),
+    }
+    missing = [name for name, value in required.items() if not value]
+    if missing:
+        # Name the variables so the operator knows which secret to set.
+        raise ValueError(f"Missing API keys: {', '.join(missing)}")
+    print("Initializing Survey Analysis Agent...")
+    agent = SurveyAnalysisAgent(
+        openai_api_key=required["OPENAI_API_KEY"],
+        pinecone_api_key=required["PINECONE_API_KEY"],
+        verbose=True
+    )
+    print("✅ Agent initialized!")
+    return agent
+def respond(message, history, request: gr.Request = None):
+    """Process one chat message and return the agent's reply.
+    Args:
+        message: The user's latest message text.
+        history: Prior chat turns supplied by gr.ChatInterface. Not read
+            here; the agent is given only the message and a thread ID.
+        request: Request object injected by Gradio when available. Its
+            session_hash is used to give each browser session its own
+            thread ID.
+    Returns:
+        The agent's answer, or a user-facing error string if agent
+        initialization or the query fails.
+    """
+    global agent
+    # Initialize agent if needed
+    if agent is None:
+        try:
+            agent = initialize_agent()
+        except Exception as e:
+            return f"⚠️ Error: {str(e)}"
+    # One thread per browser session so separate users do not share a
+    # conversation thread; fall back to the previous fixed ID when no
+    # request (or no session hash) is available.
+    session_hash = getattr(request, "session_hash", None)
+    thread_id = f"gradio_{session_hash}" if session_hash else "gradio_session"
     try:
+        response = agent.query(message, thread_id=thread_id)
+        return response
     except Exception as e:
+        return f"❌ Error: {str(e)}\n\nPlease try rephrasing your question."
+# Create the interface
+# These statements run at import time (they are outside the __main__ guard),
+# so `chatbot` and `demo` exist as module-level objects.
+print("Creating Gradio interface...")
+# Create a custom chatbot with larger height
+chatbot = gr.Chatbot(
+    height=650,  # Increased height for better readability
+    show_copy_button=True,  # Allow copying responses
+)
+# NOTE(review): the poll/question counts quoted in the description below are
+# hard-coded text — confirm they still match the indexed data.
+demo = gr.ChatInterface(
+    respond,
+    chatbot=chatbot,
+    title="🗳️ Vanderbilt Unity Poll Survey Agent",
+    description="""
+    ### AI-Powered Analysis of Survey Data
+    Ask questions about American public opinion using natural language.
+    The system will search through survey data and provide comprehensive answers.
+    **Example questions:**
+    - What do Americans think about immigration in June 2025?
+    - How has Biden's approval rating changed over time?
+    - Show me views on the economy by political party
+    - Break that down by gender
+    **Available data:**
+    - 9 polls from 2023-2025
+    - 125 questions across topics like immigration, economy, healthcare, etc.
+    - Demographic breakdowns by party, gender, age, and more
+    """,
+    examples=[
+        "What do Americans think about immigration in June 2025?",
+        "How has Biden's approval rating changed?",
+        "Show me views on the economy by political party",
+    ],
+    theme=gr.themes.Soft(),
+    # NOTE(review): retry_btn / undo_btn / clear_btn are presumably only
+    # accepted by older (4.x) gr.ChatInterface signatures — confirm against
+    # the gradio version this app is pinned to.
+    retry_btn=None,
+    undo_btn=None,
+    clear_btn="Clear Chat",
+)
 if __name__ == "__main__":
+    # Bind address: default to loopback for local runs, but honor
+    # GRADIO_SERVER_NAME so containerized hosts (which must listen on
+    # 0.0.0.0) stay reachable. A hard-coded server_name argument would
+    # take precedence over that environment variable.
+    server_name = os.getenv("GRADIO_SERVER_NAME", "127.0.0.1")
+    print("\nLaunching Gradio interface...")
+    print(f"The interface will open at http://{server_name}:7860")
+    print("\nPress Ctrl+C to stop.\n")
+    # Pre-initialize the agent. A failure here is only reported: the UI
+    # still starts, and respond() attempts initialization again on the
+    # first message.
+    try:
+        initialize_agent()
+    except Exception as e:
+        print(f"⚠️ Warning: {e}")
     demo.launch(
+        server_name=server_name,
         server_port=7860,
+        share=False,
+        show_error=True
     )

config.py ADDED Viewed

	@@ -0,0 +1,130 @@

+"""
+Configuration constants for Survey Agent V2
+"""
+# Valid topics that exist in the questionnaire vectorstore metadata
+# These are the only topics that can be used for metadata filtering
+# NOTE: this is a set, so iteration order is unspecified; code that loops
+# over it must not depend on which element comes first.
+VALID_TOPICS = {
+    "biden_administration",
+    "confidence_institutions",
+    "economy",
+    "education",
+    "elections",
+    "foreign_policy",
+    "general",
+    "healthcare",
+    "immigration",
+    "judicial",
+    "technology",
+    "trump_administration",
+}
+# Topic mapping for common variations/synonyms
+# Keys are lowercase because normalize_topic() lowercases and strips its
+# input before looking it up here. Every value is a member of VALID_TOPICS;
+# keep it that way, since mapped values are returned without re-validation.
+TOPIC_MAPPINGS = {
+    # Immigration variations
+    "deportation": "immigration",
+    "deporting": "immigration",
+    "border": "immigration",
+    "visa": "immigration",
+    "visas": "immigration",
+    "undocumented": "immigration",
+    "illegal immigration": "immigration",
+    # Economy variations
+    "tariffs": "economy",
+    "tariff": "economy",
+    "finances": "economy",
+    "financial": "economy",
+    "stock market": "economy",
+    "inflation": "economy",
+    # Education variations
+    "college": "education",
+    "colleges": "education",
+    "university": "education",
+    "universities": "education",
+    "school": "education",
+    "schools": "education",
+    # Healthcare variations
+    "health": "healthcare",
+    "medical": "healthcare",
+    "wellness": "healthcare",
+    # Technology variations
+    "ai": "technology",
+    "artificial intelligence": "technology",
+    "innovation": "technology",
+    # Elections variations
+    "voting": "elections",
+    "vote": "elections",
+    "electoral": "elections",
+    "candidate": "elections",
+    "candidates": "elections",
+    # Trump variations
+    "trump": "trump_administration",
+    "maga": "trump_administration",
+    # Biden variations
+    "biden": "biden_administration",
+    # Judicial variations
+    "court": "judicial",
+    "courts": "judicial",
+    "judge": "judicial",
+    "judges": "judicial",
+    "ruling": "judicial",
+    "rulings": "judicial",
+    # Foreign policy variations
+    "china": "foreign_policy",
+    "international": "foreign_policy",
+    "foreign": "foreign_policy",
+    # Confidence variations
+    "confidence": "confidence_institutions",
+    "trust": "confidence_institutions",
+    "institutions": "confidence_institutions",
+}
+def normalize_topic(topic: str) -> str:
+    """
+    Normalize a topic string to a valid topic.
+    Resolution order: exact VALID_TOPICS member, TOPIC_MAPPINGS synonym,
+    separator-insensitive match (e.g. "trump administration",
+    "foreign-policy"), then a substring match that identifies exactly one
+    specific topic. Ambiguous or empty input falls back to 'general'.
+    Args:
+        topic: The topic to normalize (case-insensitive)
+    Returns:
+        Normalized topic if valid/mappable, else 'general'
+    """
+    if not topic:
+        return "general"
+    topic_lower = topic.lower().strip()
+    # Check if it's already a valid topic
+    if topic_lower in VALID_TOPICS:
+        return topic_lower
+    # Check if it can be mapped
+    if topic_lower in TOPIC_MAPPINGS:
+        return TOPIC_MAPPINGS[topic_lower]
+    # Treat "_", "-" and runs of whitespace as the same separator
+    spaced = " ".join(topic_lower.replace("_", " ").replace("-", " ").split())
+    if not spaced:
+        # Whitespace/separator-only input: "" is a substring of every topic,
+        # so it must not reach the partial-match step below.
+        return "general"
+    if spaced in TOPIC_MAPPINGS:
+        return TOPIC_MAPPINGS[spaced]
+    labels = {valid.replace("_", " "): valid for valid in VALID_TOPICS}
+    if spaced in labels:
+        return labels[spaced]
+    # Partial matches. VALID_TOPICS is a set, so "first hit wins" would vary
+    # between runs; accept a partial match only when exactly one specific
+    # topic qualifies ('general' is the fallback, not a candidate).
+    candidates = [
+        valid
+        for label, valid in labels.items()
+        if valid != "general" and (spaced in label or label in spaced)
+    ]
+    if len(candidates) == 1:
+        return candidates[0]
+    # If no unambiguous match, return general (will use semantic search)
+    return "general"
+def is_valid_topic(topic: str) -> bool:
+    """Check if a topic is valid for metadata filtering.
+    Args:
+        topic: Candidate topic; compared case-insensitively after stripping
+            surrounding whitespace.
+    Returns:
+        True if the cleaned topic is in VALID_TOPICS; False otherwise,
+        including for None or empty input.
+    """
+    if not topic:
+        # Mirror normalize_topic(): falsy input is "not a topic", not an error.
+        return False
+    return topic.lower().strip() in VALID_TOPICS

crosstab_rag.py CHANGED Viewed

@@ -1,155 +1,221 @@
-#!/usr/bin/env python3
 """
-rag_crosstab_query.py
-Full Crosstab RAG pipeline:
- - Parse user query for survey/year/month/topic
- - Use QuestionnaireRAG to find matching questions (reuses existing vectorstore)
- - Extract variable names from matched questions
- - Query Pinecone within the appropriate namespace (survey crosstabs namespace)
- - Collect all parts for the matched question(s)
- - Summarize with the LLM, cite source filenames/part ids
 """
 import os
-import re
-import argparse
 from typing import List, Dict, Optional, Any
 from pathlib import Path
 from dotenv import load_dotenv
-from langchain_openai import OpenAIEmbeddings, ChatOpenAI
 from langchain.schema import Document
 from langchain_pinecone import PineconeVectorStore
 from pinecone import Pinecone
-# Import QuestionnaireRAG to reuse existing question matching
-from questionnaire_rag import QuestionnaireRAG
 load_dotenv()
-def _load_prompt_file(filename: str) -> str:
-    """Load a prompt file from the prompts directory"""
-    prompt_dir = Path(__file__).parent / "prompts"
-    prompt_path = prompt_dir / filename
-    if not prompt_path.exists():
-        raise FileNotFoundError(f"Prompt file not found: {prompt_path}")
-    return prompt_path.read_text(encoding="utf-8")
-# -------------------------
-# Config / Environment
-# -------------------------
-OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
-PINECONE_API_KEY = os.getenv("PINECONE_API_KEY_CROSSTABS")
-PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME_CROSSTABS", "crosstab-index")
-if not OPENAI_API_KEY:
-    raise ValueError("OPENAI_API_KEY environment variable not set")
-if not PINECONE_API_KEY:
-    raise ValueError("PINECONE_API_KEY_CROSSTABS environment variable not set")
-EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
-LLM_MODEL = os.getenv("OPENAI_LLM_MODEL", "gpt-4o")
 PINECONE_RETRIEVE_K = 100
 MAX_CROSSTAB_CHUNKS = 50
-# -------------------------
-# Utilities
-# -------------------------
-def extract_year_month_poll(query: str) -> Dict[str, Optional[str]]:
-    out = {"year": None, "month": None, "poll": None}
-    q = query.lower()
-    ym = re.search(r"\b(20\d{2})\b", q)
-    if ym:
-        out["year"] = ym.group(1)
-    months = ["january","february","march","april","may","june",
-              "july","august","september","october","november","december"]
-    for m in months:
-        if m in q:
-            out["month"] = m.capitalize()
-            break
-    if not out["month"]:
-        if any(word in q for word in ["recent", "latest", "current", "now"]):
-            out["month"] = "June"
-            if not out["year"]:
-                out["year"] = "2025"
-    if "vanderbilt" in q or "unity" in q:
-        out["poll"] = "Vanderbilt_Unity_Poll"
-    return out
-# -------------------------
-# Pinecone retrieval + assembly
-# -------------------------
 class CrosstabRetriever:
-    def __init__(self,
-                 pinecone_api_key: str = PINECONE_API_KEY,
-                 index_name: str = PINECONE_INDEX_NAME,
-                 embed_model: str = EMBED_MODEL,
-                 openai_api_key: str = OPENAI_API_KEY,
-                 verbose: bool = False):
         self.pc = Pinecone(api_key=pinecone_api_key)
         self.index_name = index_name
         self.embedder = OpenAIEmbeddings(model=embed_model, openai_api_key=openai_api_key)
         self.verbose = verbose
-    def _make_vectorstore(self, namespace: str) -> PineconeVectorStore:
-        index = self.pc.Index(self.index_name)
-        return PineconeVectorStore(index=index, embedding=self.embedder, namespace=namespace)
-    def retrieve_parts_for_variable(self, namespace: str, variable_prefix: str, user_query: str = None, k: int = PINECONE_RETRIEVE_K) -> List[Document]:
         """
-        Retrieve crosstab chunks for a specific variable using direct metadata filtering.
-        Since we already know the exact variable name from QuestionnaireRAG, we use
-        Pinecone metadata filtering instead of semantic search for better accuracy and speed.
         Args:
-            namespace: Pinecone namespace (e.g., "Vanderbilt_Unity_Poll_2025_February_cleaned_data_crosstabs")
-            variable_prefix: Exact variable name (e.g., "VAND15")
-            user_query: Not used anymore, kept for backward compatibility
-            k: Maximum number of chunks to retrieve (not really needed with exact filtering)
         Returns:
-            List of Document objects with crosstab data for the variable
         """
         try:
             index = self.pc.Index(self.index_name)
             stats = index.describe_index_stats()
-            namespaces = stats.get('namespaces', {})
-            if namespace not in namespaces:
-                return []
-        except Exception:
-            return []
-        # Clean variable name - the CSV filename is like "VAND15_crosstab.csv"
-        # So the variable_name stored is "VAND15_crosstab" (from csv_file.stem)
-        # But QuestionnaireRAG returns "VAND15"
-        # We need to match both formats
-        base_variable = variable_prefix.replace("_crosstab", "").split("_")[0]
-        variable_with_suffix = f"{base_variable}_crosstab"
-        if self.verbose:
-            print(f"   🔍 Looking for variable: '{base_variable}' or '{variable_with_suffix}' in namespace: '{namespace}'")
-        # Use Pinecone metadata filtering for exact match
-        # Try both formats: "VAND15" and "VAND15_crosstab"
-        try:
-            # Pinecone supports $or for multiple conditions
-            filter_dict = {
-                "$or": [
-                    {"variable_name": {"$eq": base_variable}},
-                    {"variable_name": {"$eq": variable_with_suffix}}
-                ]
-            }
-            if self.verbose:
-                print(f"   🔧 Filter: {filter_dict}")
-            # Get embedding dimension - we need a valid vector even for metadata-only queries
             embed_dim = 1536  # Default for text-embedding-3-small
             try:
                 if hasattr(self.embedder, 'model') and 'small' in str(self.embedder.model).lower():
@@ -159,538 +225,306 @@ class CrosstabRetriever:
             except:
                 pass
-            # Use a dummy vector (all zeros is fine for metadata-filtered queries)
-            # Pinecone requires a vector but with exact filters, ranking won't matter
             dummy_vector = [0.0] * embed_dim
-            result = index.query(
-                vector=dummy_vector,
-                top_k=k,
-                namespace=namespace,
-                filter=filter_dict,
-                include_metadata=True
-            )
-            if self.verbose:
-                print(f"   📊 Pinecone query returned {len(result.matches)} matches")
-            docs = []
-            for match in result.matches:
-                metadata = match.metadata or {}
-                # Debug: print what we found
                 if self.verbose:
-                    found_var = metadata.get("variable_name", "N/A")
-                    found_qid = metadata.get("question_id", "N/A")
-                    print(f"   📄 Found: variable_name='{found_var}', question_id='{found_qid}'")
-                # Pinecone stores content differently depending on how it was uploaded
-                # Try multiple ways to get the content
-                content = None
-                # Method 1: Check if there's a 'text' field in metadata (LangChain storage)
-                if 'text' in metadata:
-                    content = metadata.pop('text', '')
-                # Method 2: Check if content is in the match object itself
-                elif hasattr(match, 'values') and match.values:
-                    # This shouldn't happen with metadata filtering, but just in case
-                    pass
-                # Method 3: Try to reconstruct from metadata if available
-                elif 'page_content' in metadata:
-                    content = metadata.pop('page_content', '')
-                # If we still don't have content, we can't use this document
-                if not content:
                     if self.verbose:
-                        print(f"   ⚠️  No content found for match, skipping")
-                    continue
-                docs.append(Document(page_content=content, metadata=metadata))
             if self.verbose:
-                print(f"   ✅ Successfully loaded {len(docs)} document(s)")
-            # Sort by chunk_index to maintain order
-            docs.sort(key=lambda d: d.metadata.get("chunk_index", 999))
-            return docs[:MAX_CROSSTAB_CHUNKS]
         except Exception as e:
             if self.verbose:
-                print(f"   ❌ Error with metadata filter: {e}")
-            # Fallback: if metadata filtering fails, try fetching sample documents to debug
-            if self.verbose:
-                print(f"   🔄 Falling back to manual filtering...")
-            try:
-                # Try to fetch a sample to see what's actually in the namespace
-                # First, try fetching without filter to see what variable names exist
-                sample_result = index.query(
-                    vector=[0.0] * 1536,  # Dummy vector
-                    top_k=10,  # Just get a few samples
-                    namespace=namespace,
-                    include_metadata=True
-                )
-                if self.verbose and sample_result.matches:
-                    print(f"   📋 Sample variables in namespace:")
-                    for sample in sample_result.matches[:5]:
-                        sample_meta = sample.metadata or {}
-                        sample_var = sample_meta.get("variable_name", "N/A")
-                        sample_qid = sample_meta.get("question_id", "N/A")
-                        print(f"      - variable_name: '{sample_var}', question_id: '{sample_qid}'")
-                # Now try to find matches manually
-                result = index.query(
-                    vector=[0.0] * 1536,  # Dummy vector
-                    top_k=k * 2,  # Get more to filter from
-                    namespace=namespace,
-                    include_metadata=True
-                )
-                docs = []
-                for match in result.matches:
-                    metadata = match.metadata or {}
-                    var_name = metadata.get("variable_name", "")
-                    question_id = metadata.get("question_id", "")
-                    # Check if this matches our variable (case-insensitive)
-                    # Try matching both "VAND15" and "VAND15_crosstab" formats
-                    var_match = (base_variable.lower() == var_name.lower() or
-                                variable_with_suffix.lower() == var_name.lower() or
-                                question_id.lower().startswith(base_variable.lower() + "_") or
-                                question_id.lower().startswith(base_variable.lower()))
-                    if var_match:
-                        # Try to get content
-                        content = metadata.pop('text', '') or metadata.pop('page_content', '') or ''
-                        if content:
-                            docs.append(Document(page_content=content, metadata=metadata))
-                        elif self.verbose:
-                            print(f"   ⚠️  Matched variable '{var_name}' but no content found")
-                docs.sort(key=lambda d: d.metadata.get("chunk_index", 999))
-                if self.verbose:
-                    print(f"   ✅ Fallback found {len(docs)} document(s)")
-                return docs[:MAX_CROSSTAB_CHUNKS]
-            except Exception as fallback_error:
-                if self.verbose:
-                    print(f"   ❌ Fallback also failed: {fallback_error}")
-                return []
-# -------------------------
-# LLM summarizer
-# -------------------------
-class CrosstabSummarizer:
-    def __init__(self, llm_model: str = LLM_MODEL, openai_api_key: str = OPENAI_API_KEY):
-        self.llm = ChatOpenAI(model=llm_model, openai_api_key=openai_api_key, temperature=0.0)
-    def summarize(self, user_query: str, retrieved_docs: List[Document], question_text: Optional[str] = None, top_n_sources: int = 6) -> Dict:
-        if not retrieved_docs:
-            return {"answer": "No relevant crosstab data found for that query.", "sources": []}
-        context_parts, sources = [], []
-        for i, d in enumerate(retrieved_docs):
-            md = d.metadata or {}
-            id_hint = md.get("question_id") or md.get("variable_name") or f"part_{i+1}"
-            content = d.page_content or ""
-            context_parts.append(f"--- Part {i+1} | {id_hint} ---\n{content}")
-            sources.append(id_hint)
-        context_text = "\n\n".join(context_parts)
-        # Load prompts from files
-        system_prompt = _load_prompt_file("crosstab_rag_prompt_system.txt")
-        question_context = f"\n\nSURVEY QUESTION THAT WAS RETRIEVED: {question_text}" if question_text else ""
-        relevance_check = (
-            "\n\n⚠️ FIRST: Check if the retrieved question above is actually relevant to the user's question. "
-            "If it's about a different topic (e.g., user asked about 'economy' but question is about 'unity' or 'politics'), "
-            "you MUST state this clearly and NOT provide detailed analysis of irrelevant data."
-        ) if question_text else ""
-        user_prompt_template = _load_prompt_file("crosstab_rag_prompt_user.txt")
-        user_prompt = user_prompt_template.format(
-            user_query=user_query,
-            question_context=question_context,
-            relevance_check=relevance_check,
-            context_text=context_text
-        )
-        from langchain.schema import HumanMessage, SystemMessage
-        messages = [SystemMessage(content=system_prompt), HumanMessage(content=user_prompt)]
-        try:
-            result = self.llm.invoke(messages)
-            answer = result.content if hasattr(result, 'content') else str(result)
-        except Exception as e:
-            answer = f"Error generating summary: {e}"
-        return {"answer": answer.strip(), "sources": sources[:top_n_sources]}
-# -------------------------
-# Orchestration - full pipeline
-# -------------------------
 class CrosstabsRAG:
-    def __init__(self, questionnaire_rag: QuestionnaireRAG, verbose: bool = False):
-        """
-        Initialize CrosstabsRAG.
-        Args:
-            questionnaire_rag: Initialized QuestionnaireRAG instance to reuse for question matching
-            verbose: Whether to print detailed logging
-        """
         self.questionnaire_rag = questionnaire_rag
         self.verbose = verbose
-        self.retriever = CrosstabRetriever(verbose=verbose)
-        self.summarizer = CrosstabSummarizer()
-    def query(self, user_query: str, filters: Optional[Dict[str, Any]] = None) -> Dict:
         """
-        Query the crosstab system. Extracts poll, year, and month from the query.
-        Uses QuestionnaireRAG to find matching questions, then retrieves crosstab data.
         Args:
-            user_query: The question to answer
-            filters: Optional filters dict (may include topic, year, month, survey_name)
         Returns:
-            Dict with answer, sources, and metadata
         """
-        # Extract year, month, poll from query
-        hints = extract_year_month_poll(user_query)
-        year, month, poll = hints.get("year"), hints.get("month"), hints.get("poll")
-        # If missing required info, try to get from filters
-        if not year and filters and "year" in filters:
-            year = str(filters["year"])
-        if not month and filters and "month" in filters:
-            month = filters["month"]
-        if not poll and filters and "survey_name" in filters:
-            poll = "Vanderbilt_Unity_Poll"  # Default mapping
-        # If still missing required info, return error instead of prompting
-        if not all([poll, year, month]):
-            missing = []
-            if not poll: missing.append("poll/survey name")
-            if not year: missing.append("year")
-            if not month: missing.append("month")
-            return {"error": f"Could not determine {', '.join(missing)} from query. Please specify in your question."}
-        # Build filters for QuestionnaireRAG
-        q_filters = {
-            "year": int(year),
-            "month": month,
-            "survey_name": "Vanderbilt Unity Poll"  # Map from poll variable if needed
-        }
-        # Add topic filter if provided
-        if filters:
-            if self.verbose:
-                print(f"   📥 Received filters: {filters}")
-            if "topic" in filters and filters["topic"]:
-                q_filters["topic"] = filters["topic"]
-                if self.verbose:
-                    print(f"   📌 Added topic filter: {filters['topic']}")
-            elif self.verbose and "topic" not in filters:
-                print(f"   ⚠️  No 'topic' key in filters dict")
-            elif self.verbose:
-                print(f"   ⚠️  Topic filter is empty/None: {filters.get('topic')}")
-        elif self.verbose:
-            print(f"   ⚠️  No filters dict provided to CrosstabsRAG.query()")
-        # Enhance query text to emphasize topic if provided
-        enhanced_query = user_query
-        if filters and "topic" in filters:
-            topic = filters["topic"]
-            # Make sure topic is mentioned prominently in the query
-            if topic.lower() not in enhanced_query.lower():
-                enhanced_query = f"{topic} {enhanced_query}"
-        # Use QuestionnaireRAG to find matching questions
         if self.verbose:
-            print(f"🔍 [CrosstabRAG] Step 1: Querying QuestionnaireRAG vectorstore")
-            print(f"   Query: {enhanced_query}")
-            print(f"   Filters being passed: {q_filters}")
-        try:
-            q_result = self.questionnaire_rag.query_with_metadata(
-                question=enhanced_query,
-                filters=q_filters,
-                k=10  # Get more matches to capture all economy questions
-            )
-        except Exception as e:
-            return {"error": f"Error querying questionnaire: {e}"}
-        source_questions = q_result.get("source_questions", [])
-        if not source_questions:
-            return {"error": "No matching questions found in questionnaire for that query."}
-        if self.verbose:
-            print(f"✅ [CrosstabRAG] Step 1 Complete: QuestionnaireRAG matched {len(source_questions)} question(s)")
-            for i, q in enumerate(source_questions[:3], 1):
-                var = q.get("variable_name", "unknown")
-                qtext = q.get("question_text", "")[:80]
-                print(f"   {i}. {var}: {qtext}...")
-        # Build namespace for crosstab retrieval
-        namespace = f"{poll}_{year}_{month}_cleaned_data_crosstabs".replace(" ", "_")
-        # Process ALL matched questions (not just the first one)
-        all_question_answers = []
-        all_sources = []
-        matched_variables = []
-        for matched_question in source_questions:
-            variable_name = matched_question["variable_name"]
-            question_text = matched_question["question_text"]
             if self.verbose:
-                print(f"\n🔍 [CrosstabRAG] Step 2: Processing {variable_name}")
-                print(f"   Namespace: {namespace}")
-                print(f"   Variable: {variable_name}")
-            # Retrieve crosstab chunks for this specific variable
-            crosstab_docs = self.retriever.retrieve_parts_for_variable(
-                namespace=namespace,
-                variable_prefix=variable_name,
-                user_query=user_query,
-                k=PINECONE_RETRIEVE_K
             )
-            if not crosstab_docs:
-                if self.verbose:
-                    print(f"   ⚠️  No crosstab data found for {variable_name}")
-                continue
-            if self.verbose:
-                print(f"   ✅ Retrieved {len(crosstab_docs)} crosstab chunk(s)")
-                chunk_ids = [d.metadata.get("question_id", d.metadata.get("variable_name", "unknown")) for d in crosstab_docs[:3]]
-                print(f"   Chunk IDs: {', '.join(chunk_ids)}{' ...' if len(crosstab_docs) > 3 else ''}")
-            # Summarize this question's crosstab data
-            summary = self.summarizer.summarize(
-                user_query=user_query,
-                retrieved_docs=crosstab_docs,
-                question_text=question_text,
-                top_n_sources=6
-            )
-            # Add question identifier to the answer
-            question_header = f"\n\n--- Question: {variable_name} ---\n{question_text}\n"
-            question_answer = question_header + summary["answer"].strip()
-            all_question_answers.append(question_answer)
-            all_sources.extend(summary["sources"])
-            matched_variables.append(variable_name)
-        if not all_question_answers:
-            return {"error": f"No crosstab data found for any of the {len(source_questions)} matched questions in namespace '{namespace}'."}
-        if self.verbose:
-            print(f"\n🔍 [CrosstabRAG] Step 3: Combining {len(all_question_answers)} question(s)")
-        # Combine all question answers into a single comprehensive answer
-        combined_answer = "\n\n".join(all_question_answers)
-        # Add overall citation block
-        citation_block = (
-            f"\n\n---\nSource: {poll.replace('_', ' ')}, {month} {year}\n"
-            f"Questions analyzed: {', '.join(matched_variables)}\n"
-            f"Total questions: {len(matched_variables)}\n"
-        )
-        combined_answer = combined_answer + citation_block
-        return {
-            "answer": combined_answer,
-            "sources": list(set(all_sources)),  # Deduplicate sources
-            "matched_variable": matched_variables[0] if len(matched_variables) == 1 else f"{len(matched_variables)} questions",
-            "matched_variables": matched_variables,  # Add all matched variables
-            "matched_question": source_questions[0]["question_text"] if source_questions else "",
-            "namespace_used": namespace,
-            "survey_info": {"poll": poll, "year": year, "month": month}
-        }
-    def retrieve_raw_data(self, user_query: str, filters: Optional[Dict[str, Any]] = None) -> Dict:
-        """
-        Retrieve raw data without LLM summarization.
-        Used by agent framework to get raw data for synthesis.
-        Args:
-            user_query: The question to answer
-            filters: Optional filters dict (may include topic, year, month, survey_name)
-        Returns:
-            Dict with crosstab_docs_by_variable, matched_questions, namespace_used, survey_info
-        """
-        # Extract year, month, poll from query
-        hints = extract_year_month_poll(user_query)
-        year, month, poll = hints.get("year"), hints.get("month"), hints.get("poll")
-        # If missing required info, try to get from filters
-        if not year and filters and "year" in filters:
-            year = str(filters["year"])
-        if not month and filters and "month" in filters:
-            month = filters["month"]
-        if not poll and filters and "survey_name" in filters:
-            poll = "Vanderbilt_Unity_Poll"  # Default mapping
-        # If still missing required info, return error instead of prompting
-        if not all([poll, year, month]):
-            missing = []
-            if not poll: missing.append("poll/survey name")
-            if not year: missing.append("year")
-            if not month: missing.append("month")
-            return {"error": f"Could not determine {', '.join(missing)} from query. Please specify in your question."}
-        # Build filters for QuestionnaireRAG
-        q_filters = {
-            "year": int(year),
-            "month": month,
-            "survey_name": "Vanderbilt Unity Poll"  # Map from poll variable if needed
-        }
-        # Add topic filter if provided
-        if filters:
-            if self.verbose:
-                print(f"   📥 Received filters: {filters}")
-            if "topic" in filters and filters["topic"]:
-                q_filters["topic"] = filters["topic"]
-                if self.verbose:
-                    print(f"   📌 Added topic filter: {filters['topic']}")
-        # Enhance query text to emphasize topic if provided
-        enhanced_query = user_query
-        if filters and "topic" in filters:
-            topic = filters["topic"]
-            # Make sure topic is mentioned prominently in the query
-            if topic.lower() not in enhanced_query.lower():
-                enhanced_query = f"{topic} {enhanced_query}"
-        # Use QuestionnaireRAG to find matching questions
         if self.verbose:
-            print(f"🔍 [CrosstabRAG] Step 1: Querying QuestionnaireRAG vectorstore (raw data)")
-            print(f"   Query: {enhanced_query}")
-            print(f"   Filters being passed: {q_filters}")
         try:
             q_result = self.questionnaire_rag.retrieve_raw_data(
-                question=enhanced_query,
-                filters=q_filters,
-                k=10  # Get more matches to capture all questions
             )
         except Exception as e:
             return {"error": f"Error querying questionnaire: {e}"}
         source_questions = q_result.get("source_questions", [])
         if not source_questions:
             return {"error": "No matching questions found in questionnaire for that query."}
         if self.verbose:
-            print(f"✅ [CrosstabRAG] Step 1 Complete: QuestionnaireRAG matched {len(source_questions)} question(s)")
-            for i, q in enumerate(source_questions[:3], 1):
-                var = q.get("variable_name", "unknown")
-                qtext = q.get("question_text", "")[:80]
-                print(f"   {i}. {var}: {qtext}...")
-        # Build namespace for crosstab retrieval
-        namespace = f"{poll}_{year}_{month}_cleaned_data_crosstabs".replace(" ", "_")
-        # Process ALL matched questions and collect raw crosstab documents
-        crosstab_docs_by_variable = {}
         matched_variables = []
         for matched_question in source_questions:
             variable_name = matched_question["variable_name"]
             question_text = matched_question["question_text"]
-            if self.verbose:
-                print(f"\n🔍 [CrosstabRAG] Step 2: Processing {variable_name} (raw data)")
-                print(f"   Namespace: {namespace}")
-                print(f"   Variable: {variable_name}")
-            # Retrieve crosstab chunks for this specific variable
-            crosstab_docs = self.retriever.retrieve_parts_for_variable(
-                namespace=namespace,
-                variable_prefix=variable_name,
-                user_query=user_query,
-                k=PINECONE_RETRIEVE_K
-            )
-            if not crosstab_docs:
-                if self.verbose:
-                    print(f"   ⚠️  No crosstab data found for {variable_name}")
-                continue
-            if self.verbose:
-                print(f"   ✅ Retrieved {len(crosstab_docs)} crosstab chunk(s)")
-            # Store raw documents without summarization
-            crosstab_docs_by_variable[variable_name] = {
-                "crosstab_docs": crosstab_docs,
-                "question_text": question_text,
-                "matched_question": matched_question
-            }
-            matched_variables.append(variable_name)
-        if not crosstab_docs_by_variable:
-            return {"error": f"No crosstab data found for any of the {len(source_questions)} matched questions in namespace '{namespace}'."}
-        if self.verbose:
-            print(f"\n✅ [CrosstabRAG] Step 2 Complete: Retrieved raw data for {len(matched_variables)} question(s)")
         return {
-            "crosstab_docs_by_variable": crosstab_docs_by_variable,
             "matched_questions": source_questions,
             "matched_variables": matched_variables,
-            "namespace_used": namespace,
-            "survey_info": {"poll": poll, "year": year, "month": month}
         }
-# -------------------------
-# CLI / Interactive
-# -------------------------
-def main():
-    parser = argparse.ArgumentParser(description="Crosstab RAG CLI - query survey crosstabs.")
-    parser.add_argument("--query", "-q", help="Question to ask (if omitted, interactive).", default=None)
-    args = parser.parse_args()
-    # Initialize QuestionnaireRAG first (needed for CrosstabsRAG)
-    openai_api_key = os.getenv("OPENAI_API_KEY")
-    pinecone_api_key = os.getenv("PINECONE_API_KEY")
-    if not openai_api_key or not pinecone_api_key:
-        print("Error: Missing API keys")
-        print("Set OPENAI_API_KEY and PINECONE_API_KEY environment variables")
-        return
-    questionnaire_rag = QuestionnaireRAG(
-        openai_api_key=openai_api_key,
-        pinecone_api_key=pinecone_api_key,
-        persist_directory="./questionnaire_vectorstores",
-        verbose=False
-    )
-    system = CrosstabsRAG(questionnaire_rag=questionnaire_rag)
-    if args.query:
-        out = system.query(args.query)
-        if "error" in out:
-            print(f"Error: {out['error']}")
-        else:
-            matched_question = out.get("matched_question", "")
-            if matched_question:
-                print(f"\nSURVEY QUESTION:\n{matched_question}\n")
-            print("ANSWER:\n", out["answer"])
-    else:
-        print("Interactive Crosstab RAG\nType 'quit' to stop.")
-        while True:
-            try:
-                q = input("\nYour question: ").strip()
-                if not q or q.lower() in ("quit","exit"):
-                    break
-                out = system.query(q)
-                if "error" in out:
-                    print(f"Error: {out['error']}")
-                    continue
-                matched_question = out.get("matched_question", "")
-                if matched_question:
-                    print(f"\nSURVEY QUESTION:\n{matched_question}\n")
-                print("ANSWER:\n", out["answer"])
-            except KeyboardInterrupt:
-                break
-if __name__ == "__main__":
-    main()

 """
+Crosstab RAG Module
+------------------
+Retrieves crosstab demographic breakdown data from Pinecone vectorstore.
+Uses question_info for precise namespace matching and metadata filtering.
+Returns raw data only - no synthesis.
 """
 import os
 from typing import List, Dict, Optional, Any
 from pathlib import Path
 from dotenv import load_dotenv
+from langchain_openai import OpenAIEmbeddings
 from langchain.schema import Document
 from langchain_pinecone import PineconeVectorStore
 from pinecone import Pinecone
 load_dotenv()
+# Import QuestionnaireRAG to reuse question matching when needed
+try:
+    from questionnaire_rag import QuestionnaireRAG
+except ImportError:
+    # Handle case where running as module
+    from .questionnaire_rag import QuestionnaireRAG
 PINECONE_RETRIEVE_K = 100
 MAX_CROSSTAB_CHUNKS = 50
+class CrosstabSummarizer:
+    """Summarizes crosstab data to reduce token usage."""
+    def __init__(self, llm_model: str = None, openai_api_key: str = None):
+        from langchain_openai import ChatOpenAI
+        llm_model = llm_model or os.getenv("OPENAI_MODEL", "gpt-4o")
+        openai_api_key = openai_api_key or os.getenv("OPENAI_API_KEY")
+        self.llm = ChatOpenAI(model=llm_model, openai_api_key=openai_api_key, temperature=0.0)
+    def summarize(
+        self,
+        user_query: str,
+        retrieved_docs: List[Document],
+        question_text: Optional[str] = None,
+        top_n_sources: int = 6
+    ) -> Dict:
+        """Summarize crosstab data, extracting relevant demographic breakdowns."""
+        if not retrieved_docs:
+            return {"answer": "No relevant crosstab data found for that query.", "sources": []}
+        context_parts, sources = [], []
+        for i, d in enumerate(retrieved_docs):
+            # Handle both Document objects and dicts (from checkpoint deserialization)
+            if hasattr(d, 'metadata'):
+                md = d.metadata or {}
+                content = d.page_content or ""
+            elif isinstance(d, dict):
+                md = d.get("metadata", {})
+                content = d.get("page_content", "")
+            else:
+                md = {}
+                content = ""
+            id_hint = md.get("question_id") or md.get("variable_name") or f"part_{i+1}"
+            context_parts.append(f"--- Part {i+1} | {id_hint} ---\n{content}")
+            sources.append(id_hint)
+        context_text = "\n\n".join(context_parts)
+        # Load prompts
+        prompt_dir = Path(__file__).parent / "prompts"
+        system_prompt_path = prompt_dir / "crosstab_rag_prompt_system.txt"
+        user_prompt_path = prompt_dir / "crosstab_rag_prompt_user.txt"
+        system_prompt = system_prompt_path.read_text(encoding="utf-8") if system_prompt_path.exists() else ""
+        question_context = f"\n\nSURVEY QUESTION THAT WAS RETRIEVED: {question_text}" if question_text else ""
+        relevance_check = (
+            "\n\n⚠️ RELEVANCE: The retrieved question IS relevant to the user's query. "
+            "Remember: ALL subtopics, specific examples, and related aspects ARE relevant:\n"
+            "- 'personal financial situation' IS about economy\n"
+            "- 'tariffs' IS about economy\n"
+            "- 'stock market' IS about economy\n"
+            "- 'gender-affirming healthcare' IS about healthcare\n"
+            "- 'Biden approval' IS about presidential approval\n"
+            "Only flag as irrelevant if about a COMPLETELY UNRELATED topic (e.g., user asked 'economy' but question is about 'sports teams'). "
+            "When in doubt, ANALYZE THE DATA - do not reject it."
+        ) if question_text else ""
+        user_prompt_template = user_prompt_path.read_text(encoding="utf-8") if user_prompt_path.exists() else "{user_query}\n\n{context_text}"
+        user_prompt = user_prompt_template.format(
+            user_query=user_query,
+            question_context=question_context,
+            relevance_check=relevance_check,
+            context_text=context_text
+        )
+        from langchain.schema import HumanMessage, SystemMessage
+        messages = [SystemMessage(content=system_prompt), HumanMessage(content=user_prompt)]
+        try:
+            result = self.llm.invoke(messages)
+            answer = result.content if hasattr(result, 'content') else str(result)
+        except Exception as e:
+            answer = f"Error generating summary: {e}"
+        return {"answer": answer.strip(), "sources": sources[:top_n_sources]}
 class CrosstabRetriever:
+    """Retrieves crosstab chunks from Pinecone using metadata filtering."""
+    def __init__(
+        self,
+        pinecone_api_key: str,
+        index_name: str,
+        embed_model: str,
+        openai_api_key: str,
+        verbose: bool = False
+    ):
         self.pc = Pinecone(api_key=pinecone_api_key)
         self.index_name = index_name
         self.embedder = OpenAIEmbeddings(model=embed_model, openai_api_key=openai_api_key)
         self.verbose = verbose
+    def _build_namespace_from_question_info(self, question_info: Dict[str, Any]) -> Optional[str]:
+        """Build namespace from question_info (year + month)"""
+        year = question_info.get("year")
+        month = question_info.get("month", "")
+        if year and month:
+            return f"Vanderbilt_Unity_Poll_{year}_{month}_cleaned_data_crosstabs".replace(" ", "_")
+        # Try to extract from poll_date
+        poll_date = question_info.get("poll_date", "")
+        if poll_date:
+            try:
+                from datetime import datetime
+                # Handle format like "2025-June"
+                if "-" in poll_date and len(poll_date.split("-")) == 2:
+                    year_str, month_str = poll_date.split("-")
+                    return f"Vanderbilt_Unity_Poll_{year_str}_{month_str}_cleaned_data_crosstabs".replace(" ", "_")
+                else:
+                    date_obj = datetime.strptime(poll_date, "%Y-%m-%d")
+                    year_str = str(date_obj.year)
+                    month_str = date_obj.strftime("%B")
+                    return f"Vanderbilt_Unity_Poll_{year_str}_{month_str}_cleaned_data_crosstabs".replace(" ", "_")
+            except Exception as e:
+                if self.verbose:
+                    print(f"   ⚠️  Failed to parse poll_date '{poll_date}': {e}")
+        return None
+    def retrieve_parts_for_question_info(
+        self,
+        question_info_list: List[Dict[str, Any]],
+        k: int = PINECONE_RETRIEVE_K,
+        filters: Optional[Dict[str, Any]] = None
+    ) -> Dict[str, List[Document]]:
         """
+        Retrieve crosstab chunks for question_info list.
+        Groups by namespace (year/month) and filters by variable_name and question_id.
+        Each question is assigned to the namespace derived from its own metadata
+        (via _build_namespace_from_question_info). If that namespace is missing or
+        not present in the index, the namespace built from ``filters`` (requires
+        both "year" and "month", and must exist in the index) is used instead.
+        Questions for which neither namespace is usable are skipped; this method
+        never searches across all namespaces.
+        Each selected namespace is queried once with an all-zero vector plus a
+        metadata filter (NOTE(review): presumably so the filter, not similarity,
+        decides the matches - confirm against the index metric).
+        Per variable, the documents are sorted by metadata "chunk_index" (999 when
+        absent) and truncated to MAX_CROSSTAB_CHUNKS.
         Args:
+            question_info_list: List of question info dicts with variable_name, year, month, question_id
+                (entries without a variable_name are ignored)
+            k: Number of results to retrieve per variable; the namespace query uses
+                top_k = k * (number of distinct variables), i.e. a combined budget
+            filters: Optional filters with year/month to constrain namespace search
         Returns:
+            Dict mapping variable_name to list of Document objects.
+            An empty dict is returned when the index reports no namespaces or when an
+            exception escapes the main body; a failing query for a single namespace is
+            only reported (if verbose) and that namespace is skipped.
         """
         try:
             index = self.pc.Index(self.index_name)
             stats = index.describe_index_stats()
+            available_namespaces = list(stats.get('namespaces', {}).keys())
+            if not available_namespaces:
+                if self.verbose:
+                    print("   ⚠️  No namespaces found in index")
+                return {}
+            # Build target namespace from filters if provided (used only as a fallback below)
+            target_namespace = None
+            if filters:
+                year = filters.get("year")
+                month = filters.get("month", "")
+                if year and month:
+                    target_namespace = f"Vanderbilt_Unity_Poll_{year}_{month}_cleaned_data_crosstabs".replace(" ", "_")
+                    if target_namespace not in available_namespaces:
+                        if self.verbose:
+                            print(f"   ⚠️  Target namespace {target_namespace} not found in available namespaces")
+                        target_namespace = None
+            # Group questions by namespace: namespace -> list of variable names (may contain duplicates)
+            questions_by_namespace = {}
+            for q_info in question_info_list:
+                var_name = q_info.get("variable_name")
+                if not var_name:
+                    continue
+                # Try to build namespace from question_info first
+                namespace = self._build_namespace_from_question_info(q_info)
+                if namespace and namespace in available_namespaces:
+                    if namespace not in questions_by_namespace:
+                        questions_by_namespace[namespace] = []
+                    questions_by_namespace[namespace].append(var_name)
+                elif target_namespace:
+                    # Use target namespace from filters
+                    if target_namespace not in questions_by_namespace:
+                        questions_by_namespace[target_namespace] = []
+                    questions_by_namespace[target_namespace].append(var_name)
+                else:
+                    # No usable namespace from the question metadata or from filters.
+                    # Deliberately do NOT fall back to searching every namespace here.
+                    if self.verbose:
+                        print(f"   ⚠️  Could not determine namespace for {var_name} (year={q_info.get('year')}, month={q_info.get('month')})")
+                    # Skip this question rather than searching all namespaces
+                    continue
+            # Get embedding dimension (only needed to size the dummy query vector)
             embed_dim = 1536  # Default for text-embedding-3-small
             try:
                 if hasattr(self.embedder, 'model') and 'small' in str(self.embedder.model).lower():  # NOTE(review): the body of this `if` is not visible in this diff view - confirm against the full file
             except:  # NOTE(review): bare except; also catches KeyboardInterrupt/SystemExit
                 pass
             dummy_vector = [0.0] * embed_dim  # all-zero placeholder vector passed to index.query below
+            all_docs_by_variable = {}
+            # Build mapping from variable_name to question_id for filtering (a later duplicate variable_name overwrites an earlier one)
+            var_to_question_id = {}
+            for q_info in question_info_list:
+                var_name = q_info.get("variable_name")
+                question_id = q_info.get("question_id")
+                if var_name and question_id:
+                    var_to_question_id[var_name] = question_id
+            # Search each namespace (one query per namespace)
+            for namespace, var_names in questions_by_namespace.items():
+                if namespace not in available_namespaces:
+                    continue
                 if self.verbose:
+                    print(f"   🔍 Searching namespace: {namespace}")
+                    print(f"      Looking for variables: {', '.join(sorted(set(var_names)))}")
+                    if var_to_question_id:
+                        matched_vars = [v for v in var_names if v in var_to_question_id]
+                        if matched_vars:
+                            print(f"      🔑 Using question_id filter for: {', '.join(sorted(set(matched_vars)))}")
+                # Build filter for variable names and question IDs
+                unique_vars = list(set(var_names))
+                # Build filter conditions - match on either variable_name OR question_id
+                filter_conditions = []
+                for var in unique_vars:
+                    var_conditions = []
+                    # Add variable_name conditions (with and without _crosstab suffix)
+                    var_conditions.append({"variable_name": {"$eq": var}})
+                    var_conditions.append({"variable_name": {"$eq": f"{var}_crosstab"}})
+                    # Add question_id condition if available
+                    # Note: question_id in Pinecone metadata might have _part suffix for chunked crosstabs
+                    # but we match on base question_id and filter in post-processing
+                    # NOTE(review): "$eq" on the base id would not match a stored "<id>_partN" value; such chunks presumably match via variable_name instead - confirm
+                    if var in var_to_question_id:
+                        question_id = var_to_question_id[var]
+                        var_conditions.append({"question_id": {"$eq": question_id}})
+                    # Combine conditions for this variable with $or
+                    if len(var_conditions) > 1:  # always true here: two variable_name conditions are appended above
+                        filter_conditions.append({"$or": var_conditions})
+                    else:
+                        filter_conditions.append(var_conditions[0])
+                # Combine all variable filters with $or
+                if len(filter_conditions) == 1:
+                    var_filter = filter_conditions[0]
+                else:
+                    var_filter = {"$or": filter_conditions}
+                try:
+                    result = index.query(
+                        vector=dummy_vector,
+                        top_k=k * len(unique_vars),
+                        namespace=namespace,
+                        filter=var_filter,
+                        include_metadata=True
+                    )
                     if self.verbose:
+                        print(f"      📊 Found {len(result.matches)} matches in {namespace}")
+                    for match in result.matches:
+                        metadata = match.metadata or {}
+                        var_name = metadata.get("variable_name", "")
+                        # Handle question_id format like "VAND10_part1": the part before "_part" is compared with the requested variable names
+                        question_id = metadata.get("question_id", "")
+                        if question_id and "_part" in question_id:
+                            base_var = question_id.split("_part")[0].replace("_crosstab", "")
+                            if base_var in unique_vars:
+                                var_name = base_var
+                        # Check if variable_name has _crosstab suffix and map it back to the requested name
+                        if var_name and var_name.endswith("_crosstab"):
+                            base_var = var_name.replace("_crosstab", "")
+                            if base_var in unique_vars:
+                                var_name = base_var
+                        if not var_name or var_name not in unique_vars:
+                            continue
+                        content = metadata.pop('text', '') or metadata.pop('page_content', '') or ''  # pop removes the text from metadata; 'page_content' is only popped when 'text' is empty/missing
+                        if not content:
+                            continue
+                        if var_name not in all_docs_by_variable:
+                            all_docs_by_variable[var_name] = []
+                        all_docs_by_variable[var_name].append(
+                            Document(page_content=content, metadata=metadata)
+                        )
+                except Exception as e:
+                    if self.verbose:
+                        print(f"      ⚠️  Error querying namespace {namespace}: {e}")
+                    continue
+            # Sort documents by chunk_index (missing index sorts last via 999), then cap per variable
+            for var_name in all_docs_by_variable:
+                all_docs_by_variable[var_name].sort(key=lambda d: d.metadata.get("chunk_index", 999))
+                all_docs_by_variable[var_name] = all_docs_by_variable[var_name][:MAX_CROSSTAB_CHUNKS]
             if self.verbose:
+                total_docs = sum(len(docs) for docs in all_docs_by_variable.values())
+                print(f"   ✅ Retrieved {total_docs} total document(s) for {len(all_docs_by_variable)} variable(s)")
+            return all_docs_by_variable
         except Exception as e:
             if self.verbose:
+                print(f"   ❌ Error in retrieve_parts_for_question_info: {e}")
+            return {}
 class CrosstabsRAG:
+    """Crosstabs RAG with question_info-based retrieval.
+    Wraps a CrosstabRetriever configured from environment variables
+    (PINECONE_API_KEY, OPENAI_API_KEY, PINECONE_INDEX_NAME_CROSSTABS with
+    default "crosstab-index", OPENAI_EMBED_MODEL with default
+    "text-embedding-3-small") together with a QuestionnaireRAG that is used to
+    look up question metadata and, when no question_info is supplied, to find
+    the questions matching a free-text query.
+    """
+    def __init__(
+        self,
+        questionnaire_rag: QuestionnaireRAG,
+        verbose: bool = False
+    ):
         self.questionnaire_rag = questionnaire_rag
         self.verbose = verbose
+        pinecone_api_key = os.getenv("PINECONE_API_KEY")  # None when unset; passed through without validation
+        openai_api_key = os.getenv("OPENAI_API_KEY")  # None when unset; passed through without validation
+        index_name = os.getenv("PINECONE_INDEX_NAME_CROSSTABS", "crosstab-index")
+        embed_model = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
+        self.retriever = CrosstabRetriever(
+            pinecone_api_key=pinecone_api_key,
+            index_name=index_name,
+            embed_model=embed_model,
+            openai_api_key=openai_api_key,
+            verbose=verbose
+        )
+    def retrieve_raw_data(
+        self,
+        user_query: str,
+        question_info: Optional[List[Dict[str, Any]]] = None,
+        source_questions: Optional[List[Dict[str, Any]]] = None,
+        filters: Optional[Dict[str, Any]] = None
+    ) -> Dict:
         """
+        Retrieve raw crosstab data.
+        Uses question_info if provided (skips QuestionnaireRAG).
+        Otherwise uses QuestionnaireRAG to find questions, then retrieves crosstabs.
+        NOTE(review): no semantic-search fallback is visible in this method; when the
+        retriever returns nothing, an {"error": ...} dict is returned instead.
         Args:
+            user_query: User's query (used for QuestionnaireRAG if question_info not provided)
+            question_info: List of question info dicts (preferred - skips QuestionnaireRAG)
+            source_questions: Optional list of full question dicts from previous stage (avoids lookup);
+                only consulted in the question_info path
+            filters: Optional filters for QuestionnaireRAG; also forwarded to the crosstab
+                retriever, but only in the question_info path
         Returns:
+            On success, a dict with crosstab_docs_by_variable, matched_questions,
+            matched_variables, namespace_used and survey_info. namespace_used is collected
+            from the "survey_name" metadata of each variable's first chunk, and
+            survey_info always carries year=None and month=None.
+            On failure, a dict with a single "error" key holding a message.
         """
         if self.verbose:
+            print(f"\n📊 [Crosstabs] Query: {user_query}")
+            if question_info:
+                print(f"🔍 Question info: {len(question_info)} question(s) provided")
+            if filters:
+                print(f"🔍 Filters: {filters}")
+        # If question_info provided, skip QuestionnaireRAG
+        if question_info:
             if self.verbose:
+                print(f"✅ Using provided question_info, skipping QuestionnaireRAG")
+            # Retrieve crosstab data directly
+            crosstab_docs_by_variable = self.retriever.retrieve_parts_for_question_info(
+                question_info_list=question_info,
+                k=PINECONE_RETRIEVE_K,
+                filters=filters
             )
+            if not crosstab_docs_by_variable:
+                return {"error": f"No crosstab data found for {len(question_info)} question(s)."}
+            # Get question metadata - use provided source_questions if available, otherwise lookup
+            if not source_questions:
+                source_questions = []
+                questions_by_id = self.questionnaire_rag.questions_by_id
+                for q_info in question_info:
+                    question_id = q_info.get("question_id")
+                    if question_id and question_id in questions_by_id:
+                        source_questions.append(questions_by_id[question_id])
+                    else:
+                        # Fallback: try to find by variable_name and year/month
+                        var_name = q_info.get("variable_name")
+                        year = q_info.get("year")
+                        month = q_info.get("month", "")
+                        if var_name:
+                            # Search through questions_by_id for matching variable (linear scan, first match wins)
+                            for qid, q_data in questions_by_id.items():
+                                if (q_data.get("variable_name") == var_name and
+                                    q_data.get("year") == year and
+                                    q_data.get("month", "") == month):
+                                    source_questions.append(q_data)
+                                    break
+            # Format results: one entry per variable that actually returned crosstab chunks
+            formatted_results = {}
+            matched_variables = []
+            all_namespaces = set()
+            for var_name, docs in crosstab_docs_by_variable.items():
+                question_metadata = next(  # first source question with this variable_name, else {}
+                    (q for q in source_questions if q.get("variable_name") == var_name),
+                    {}
+                )
+                question_text = question_metadata.get("question_text", "")
+                if docs:
+                    first_doc_meta = docs[0].metadata
+                    survey_name = first_doc_meta.get("survey_name", "")
+                    all_namespaces.add(survey_name)  # adds "" when the chunk has no survey_name
+                formatted_results[var_name] = {
+                    "crosstab_docs": docs,
+                    "question_text": question_text or (docs[0].metadata.get("question_text", "") if docs else ""),
+                    "matched_question": question_metadata
+                }
+                matched_variables.append(var_name)
+            return {
+                "crosstab_docs_by_variable": formatted_results,
+                "matched_questions": source_questions,
+                "matched_variables": matched_variables,
+                "namespace_used": list(all_namespaces),
+                "survey_info": {"poll": "Vanderbilt_Unity_Poll", "year": None, "month": None}
+            }
+        # Otherwise, use QuestionnaireRAG to find questions first
         if self.verbose:
+            print(f"🔍 Using QuestionnaireRAG to find questions")
         try:
             q_result = self.questionnaire_rag.retrieve_raw_data(
+                question=user_query,
+                filters=filters or {},
+                k=10
             )
         except Exception as e:
             return {"error": f"Error querying questionnaire: {e}"}
         source_questions = q_result.get("source_questions", [])
+        question_info_from_questions = q_result.get("question_info", [])
         if not source_questions:
             return {"error": "No matching questions found in questionnaire for that query."}
         if self.verbose:
+            print(f"✅ Found {len(source_questions)} question(s) from QuestionnaireRAG")
+        # Retrieve crosstab data using question_info
+        crosstab_docs_by_variable = self.retriever.retrieve_parts_for_question_info(
+            question_info_list=question_info_from_questions,
+            k=PINECONE_RETRIEVE_K  # NOTE(review): filters are not forwarded here, unlike the question_info path above - confirm this is intended
+        )
+        if not crosstab_docs_by_variable:
+            return {"error": f"No crosstab data found for any of the {len(source_questions)} matched questions."}
+        # Format results: only questions whose variable returned crosstab chunks are included
+        formatted_results = {}
         matched_variables = []
+        all_namespaces = set()
         for matched_question in source_questions:
             variable_name = matched_question["variable_name"]  # direct indexing: raises KeyError if the key is missing
             question_text = matched_question["question_text"]  # direct indexing: raises KeyError if the key is missing
+            if variable_name in crosstab_docs_by_variable:
+                formatted_results[variable_name] = {
+                    "crosstab_docs": crosstab_docs_by_variable[variable_name],
+                    "question_text": question_text,
+                    "matched_question": matched_question
+                }
+                matched_variables.append(variable_name)
+                if crosstab_docs_by_variable[variable_name]:
+                    first_doc = crosstab_docs_by_variable[variable_name][0]
+                    survey_name = first_doc.metadata.get("survey_name", "")
+                    all_namespaces.add(survey_name)  # adds "" when the chunk has no survey_name
         return {
+            "crosstab_docs_by_variable": formatted_results,
             "matched_questions": source_questions,
             "matched_variables": matched_variables,
+            "namespace_used": list(all_namespaces),
+            "survey_info": {"poll": "Vanderbilt_Unity_Poll", "year": None, "month": None}
         }

prompts/crosstab_rag_prompt_system.txt CHANGED Viewed

@@ -1,9 +1,15 @@
 You are a data analyst assistant specialized in interpreting survey crosstab tables.
-## CRITICAL: Relevance Check
-Before answering, check if the retrieved question actually matches the user's query.
-- If the question is about a DIFFERENT topic, explicitly state this
-- Do NOT provide detailed analysis of irrelevant data
-- Only provide detailed analysis if the question is relevant to the user's query
 Provide clear, specific answers based only on the context provided.

 You are a data analyst assistant specialized in interpreting survey crosstab tables.
+## CRITICAL: Assume Relevance Unless Obviously Wrong
+The retrieved questions have already been filtered by topic, so assume they ARE relevant.
+- Subtopics and specific aspects ARE relevant (e.g., "personal finances" IS economy, "tariffs" IS economy, "stock market" IS economy)
+- ONLY reject data if it's about a COMPLETELY unrelated topic (e.g., user asked about "economy" but data is about "favorite sports team")
+- When in doubt, PROVIDE THE ANALYSIS - do not be overly cautious
+## Data Extraction Requirements
+- Extract ACTUAL percentages and counts for each demographic group from the crosstab
+- When sample sizes are shown in the data (e.g., "N=500" or counts in parentheses), include them
+- Present data in structured format (tables when appropriate)
+- DO NOT make up or estimate values - use only what's in the context
 Provide clear, specific answers based only on the context provided.

prompts/relevance_check_prompt.txt ADDED Viewed

	@@ -0,0 +1,93 @@

+You are analyzing conversation continuity in a multi-turn survey data analysis system.
+Your task: Determine if the current question is related to previous conversation and what data can be reused.
+## CONVERSATION HISTORY
+{conversation_summary}
+## PREVIOUSLY RETRIEVED DATA
+{previous_data_summary}
+## CURRENT QUESTION
+{current_question}
+## ANALYSIS REQUIRED
+1. **Is the current question related to the previous conversation?**
+   - YES if: Same topic, same questions, same time period (even if different demographic)
+   - YES if: Asking for trend/analysis of already-shown data
+   - NO if: Completely different topic
+   - NO if: Same topic but different time period (e.g., June 2025 → February 2025)
+2. **Relation Type** (always set; the first two types apply when related, the last two when NOT related):
+   - `same_topic_different_demo`: Same topic/questions, asking for different demographic breakdown
+     * Example: Previous "immigration by party" → Current "immigration by gender"
+   - `trend_analysis`: Asking for analysis/trends from already-retrieved data
+     * Example: Previous showed data from 3 polls → Current "what's the trend?"
+   - `same_topic_different_time`: Same topic but different time period
+     * Example: Previous "immigration June 2025" → Current "immigration February 2025"
+   - `new_topic`: Completely different topic
+     * Example: Previous "immigration" → Current "economy"
+3. **Reusable Data**:
+   - `questions`: true if same questions can be reused (same topic, same time period)
+   - `toplines`: true if overall frequencies already retrieved and still relevant
+   - `crosstabs`: true if demographic breakdowns already retrieved and still relevant
+4. **Time Period Changed**:
+   - true if current question asks about different year/month than previous
+   - false if time period is same or not specified
+## OUTPUT FORMAT
+Return a structured assessment with fields:
+- is_related: boolean
+- relation_type: string (one of the types above)
+- reusable_data: {"questions": boolean, "toplines": boolean, "crosstabs": boolean}
+- time_period_changed: boolean
+- reasoning: string (1-2 sentence explanation)
+## EXAMPLES
+Example 1:
+Previous: "How do immigration responses vary by political party in June 2025?"
+Current: "Let's look at the breakdown by gender as well"
+→ is_related: true, relation_type: "same_topic_different_demo", reusable_data: {questions: true, toplines: false, crosstabs: false}, time_period_changed: false
+Reasoning: Same topic (immigration) and time period (June 2025), just requesting different demographic breakdown (gender instead of party).
+Example 2:
+Previous: "What is Joe Biden's approval rating in June 2025?"
+Current: "Let's examine how this breaks down by gender"
+→ is_related: true, relation_type: "same_topic_different_demo", reusable_data: {questions: true, toplines: true, crosstabs: false}, time_period_changed: false
+Reasoning: Same question (Biden approval) and time period, asking for demographic breakdown of already-retrieved topline data.
+Example 3:
+Previous: "Immigration questions in June 2025"
+Current: "What about February 2025?"
+→ is_related: false, relation_type: "same_topic_different_time", reusable_data: {questions: false, toplines: false, crosstabs: false}, time_period_changed: true
+Reasoning: Same topic but different time period - questions from June 2025 cannot be assumed to exist in February 2025.
+Example 4:
+Previous: Showed Biden approval by party for 3 different polls (June 2024, Sept 2024, June 2025)
+Current: "What's the trend over time?"
+→ is_related: true, relation_type: "trend_analysis", reusable_data: {questions: true, toplines: true, crosstabs: true}, time_period_changed: false
+Reasoning: User wants analysis/trends from already-retrieved and displayed data, no new data retrieval needed.
+Example 5:
+Previous: "How do immigration responses vary by political party?"
+Current: "Now show me the breakdown by gender"
+→ is_related: true, relation_type: "same_topic_different_demo", reusable_data: {questions: true, toplines: false, crosstabs: false}, time_period_changed: false
+Reasoning: Same immigration questions, same time period (unspecified in both), just requesting different demographic breakdown.
+Example 6:
+Previous: "What questions about the economy were asked in 2025?"
+Current: "Tell me about immigration policies"
+→ is_related: false, relation_type: "new_topic", reusable_data: {questions: false, toplines: false, crosstabs: false}, time_period_changed: false
+Reasoning: Completely different topic - economy vs immigration, no data can be reused.
+Example 7:
+Previous: "Biden approval rating June 2025"
+Current: "How does this break down by age?"
+→ is_related: true, relation_type: "same_topic_different_demo", reusable_data: {questions: true, toplines: true, crosstabs: false}, time_period_changed: false
+Reasoning: Same question and time period, asking for age demographic breakdown of already-retrieved approval data.

prompts/research_brief_prompt.txt CHANGED Viewed

@@ -10,6 +10,67 @@ Available data sources:
 {available_months}
 ## ACTIONS
 **1. followup** - Ask clarifying question if ambiguous OR unavailable data requested
@@ -20,13 +81,24 @@ Available data sources:
    - Most queries: single time period, specific question requests
    - Pipeline selection:
      * QUESTIONNAIRE: "what questions", "list questions", "show questions"
-     * TOPLINES: "approval", "ratings", "percentages", "how many", "what %", "response frequencies"
      * CROSSTABS: "vary by", "breakdown by", "by gender/age/race/etc", "differences by"
    - Retrieve ONLY the mentioned time period (no comparison unless explicit)
 **4. execute_stages** - Multi-stage for complex queries
    - Explicit comparisons: "compare X vs Y", "what changed"
    - Queries needing analysis across multiple retrievals
    - Do NOT use for simple follow-ups about different time periods
 ## CONVERSATION CONTEXT RULES
@@ -37,6 +109,61 @@ Available data sources:
 - Create stages per month/question as appropriate
 - Do NOT ask followup if context can be inferred
 **Time Period Queries**:
 - "what about [X]?" = NEW question about X (not comparison)
 - Extract year+month → single-stage (route_to_sources)
@@ -47,9 +174,21 @@ Available data sources:
     - Specific query ("approval in 2025?") → followup if ambiguous
 **Broad Queries** (no time specification):
-- Assume analysis across ALL available polls (last 2+ years)
-- Use execute_stages with one stage per available poll
-- Do NOT ask followup - create stages automatically
 ## FILTERING
 - Map survey names: "Unity Poll" → "Vanderbilt_Unity_Poll"
@@ -60,16 +199,46 @@ Available data sources:
 Simple queries (route_to_sources):
 - "what questions were asked in June 2025?" → questionnaire, year=2025, month=June
 - "what about June 2025?" (after June 2022) → questionnaire, year=2025, month=June (NOT staged)
-- "Trump's approval in June 2025?" → toplines, year=2025, month=June
 - "questions about economy in 2025?" → questionnaire, year=2025, topic='economy'
 Multi-stage (execute_stages):
 - "compare June 2024 vs June 2025" → stage 1: 2024, stage 2: 2025
 - "how do responses vary by gender in 2025?" (no month) → stages for all 2025 months
-- "how do immigration responses vary by party?" (no time) → stages for all available polls
 Follow-up handling:
 - "how do responses vary by gender for each of these questions?" (referencing previous)
-  → Infer months from previous question's year, create stages per month
 - "what was trump's approval in 2025?" → followup: "Which month(s) in 2025?"
-- "June" (short answer) → combine with previous intent, use toplines (approval = data)

 {available_months}
+## VALID TOPICS FOR METADATA FILTERING
+**CRITICAL: When extracting topics from user queries, you MUST use ONLY these standardized topics:**
+- `biden_administration` - Biden, his administration, policies
+- `confidence_institutions` - Trust/confidence in institutions
+- `economy` - Economy, finances, tariffs, inflation, stock market
+- `education` - Education, colleges, universities, schools
+- `elections` - Voting, elections, candidates, electoral process
+- `foreign_policy` - International relations, China, foreign affairs
+- `general` - General topics, unity, division, democracy, other
+- `healthcare` - Health, medical, wellness
+- `immigration` - Immigration, deportation, border, visas, undocumented
+- `judicial` - Courts, judges, rulings, legal system
+- `technology` - AI, artificial intelligence, innovation, tech
+- `trump_administration` - Trump, MAGA, his administration, policies
+**Topic Extraction Guidelines:**
+- If user asks about "deporting undocumented immigrants" → use topic=`immigration`
+- If user asks about "tariffs" or "stock market" → use topic=`economy`
+- If user asks about "colleges" → use topic=`education`
+- If user asks about "Trump policies" → use topic=`trump_administration`
+- If user asks about "Biden approval" → use topic=`biden_administration`
+- If the topic doesn't clearly map to the list above → use topic=`general` OR rely on semantic search (no topic filter)
+- **NEVER invent new topics** - only use the 12 topics listed above
+## EFFICIENCY RULES (CRITICAL - REDUCE API CALLS)
+**Topic-only CROSSTABS queries** (e.g., "how do immigration responses vary by X?"):
+- NEVER create one stage per poll - this causes 9+ unnecessary QuestionnaireRAG queries
+- ALWAYS use 2-stage approach:
+  1. Stage 1: QUESTIONNAIRE with topic filter (NO year/month) → finds ALL questions across all polls in ONE query
+  2. Stage 2: CROSSTABS with question_ids from Stage 1 → searches all namespaces efficiently
+- This reduces API calls from 9+ to just 2 stages total
+**Topic-based TOPLINES queries** (CRITICAL - MUST IDENTIFY QUESTIONS FIRST):
+- NEVER use route_to_sources for topic-based toplines queries (e.g., "Joe Biden approval", "Trump approval")
+- ALWAYS use 2-stage approach:
+  1. Stage 1: QUESTIONNAIRE with topic/person filter + year/month → identifies relevant question(s)
+  2. Stage 2: TOPLINES with question_info from Stage 1 → retrieves response data
+- This ensures correct question identification before data retrieval
+- Only use route_to_sources with TOPLINES if:
+  * User explicitly mentions a variable name/question ID (e.g., "VAND5", "VAND15")
+  * Questions were already retrieved in previous conversation turns
+**When question IDs are available**:
+- If a previous stage found questions (questionnaire/toplines), ALWAYS use the question_ids filter
+- This skips QuestionnaireRAG entirely in crosstabs queries (saves API calls)
+## WHEN TO ASK FOLLOWUP vs BROAD SEARCH
+**ASK FOLLOWUP for:**
+- QUESTIONNAIRE queries without time period: "what questions about X were asked?" → Ask for time period
+- TOPLINES queries without time period: "what was approval?" → Ask for time period
+- Queries that are ambiguous or missing critical information
+**DO NOT ASK FOLLOWUP for:**
+- CROSSTABS queries without time period: "how do responses about X vary by Y?" → Do broad search across all polls
+  * These queries benefit from cross-poll analysis
+  * Use 2-stage approach: Stage 1 finds all questions, Stage 2 gets crosstabs
 ## ACTIONS
 **1. followup** - Ask clarifying question if ambiguous OR unavailable data requested
    - Most queries: single time period, specific question requests
    - Pipeline selection:
      * QUESTIONNAIRE: "what questions", "list questions", "show questions"
+     * TOPLINES: ONLY use route_to_sources with TOPLINES if:
+       - User explicitly mentions a variable name/question ID (e.g., "VAND5", "VAND15")
+       - Questions were already retrieved in previous conversation turns (system will extract question_info automatically)
+       - DO NOT use route_to_sources for topic-based toplines queries (e.g., "Joe Biden approval", "Trump approval")
+       - For topic-based toplines queries, use execute_stages with Stage 1 querying QUESTIONNAIRE first
      * CROSSTABS: "vary by", "breakdown by", "by gender/age/race/etc", "differences by"
    - Retrieve ONLY the mentioned time period (no comparison unless explicit)
 **4. execute_stages** - Multi-stage for complex queries
    - Explicit comparisons: "compare X vs Y", "what changed"
    - Queries needing analysis across multiple retrievals
+   - Topic-only crosstab queries (see EFFICIENCY RULES above)
+   - **CRITICAL: Topic-based TOPLINES queries** (e.g., "Joe Biden approval", "Trump approval", "immigration responses"):
+     * ALWAYS use 2-stage approach:
+       1. Stage 1: QUESTIONNAIRE with topic/person filter + year/month → identifies relevant question(s)
+       2. Stage 2: TOPLINES with question_info from Stage 1 → retrieves response data
+     * This ensures correct question identification before data retrieval
+     * DO NOT use route_to_sources for topic-based toplines queries
    - Do NOT use for simple follow-ups about different time periods
 ## CONVERSATION CONTEXT RULES
 - Create stages per month/question as appropriate
 - Do NOT ask followup if context can be inferred
+**Relevance Analysis** (CRITICAL for efficiency):
+- If RELEVANCE ANALYSIS section is provided in the conversation context above:
+  * ALWAYS check the relation_type to determine the correct strategy
+  * If relation_type = "same_topic_different_demo":
+    - Use route_to_sources with TOPLINES or CROSSTABS (single-stage)
+    - Questions are already identified and available from previous turn
+    - System will automatically extract question_info
+    - DO NOT create execute_stages with QUESTIONNAIRE query
+    - Example: Previous "immigration by party" → Current "immigration by gender"
+      → Use route_to_sources with CROSSTABS (NOT execute_stages)
+  * If relation_type = "trend_analysis":
+    - Use action='answer' to analyze already-retrieved data
+    - DO NOT retrieve any new data from any pipeline
+    - Synthesize answer from conversation history and previously shown results
+    - Example: Previous showed data from 3 polls → Current "what's the trend?"
+      → Use action='answer' (NOT execute_stages or route_to_sources)
+  * If relation_type = "same_topic_different_time":
+    - Treat as NEW QUERY even though topic is same
+    - Time period changed, so previous questions may not exist
+    - Must query QUESTIONNAIRE for new time period
+    - Use execute_stages with Stage 1 = QUESTIONNAIRE, Stage 2 = TOPLINES/CROSSTABS
+    - Example: Previous "June 2025" → Current "February 2025"
+      → Use execute_stages with QUESTIONNAIRE query for February 2025
+  * If relation_type = "new_topic":
+    - Treat as completely new query
+    - Follow standard routing logic below
+    - No data can be reused from previous conversation
+- If NO RELEVANCE ANALYSIS section (first turn or relevance check unavailable):
+  * Follow standard routing logic below
+**Previously Retrieved Questions** (CRITICAL for efficiency):
+- System automatically detects when questions were retrieved in previous turns
+- If RELEVANCE ANALYSIS shows relation_type = "same_topic_different_demo":
+  * Questions are already identified - DO NOT query QUESTIONNAIRE
+  * Use route_to_sources with TOPLINES or CROSSTABS (single-stage)
+  * System automatically extracts question_info from previous results
+  * Example: Previous "immigration by party" → Current "immigration by gender"
+    → Use route_to_sources with CROSSTABS (NOT execute_stages)
+- If RELEVANCE ANALYSIS shows time_period_changed = true:
+  * Previous questions are NOT reusable
+  * Must re-query QUESTIONNAIRE for new time period
+- If RELEVANCE ANALYSIS shows relation_type = "trend_analysis":
+  * All data already retrieved and displayed
+  * Use action='answer' to synthesize from history
+  * DO NOT create any data retrieval stages
+**Question ID Tracking** (CRITICAL for efficiency):
+- If previous query used TOPLINES pipeline, extract variable_name from toplines results
+- If previous query used QUESTIONNAIRE pipeline, extract question_id or variable_name
+- For follow-up queries like "how does this vary by gender":
+  * If question IDs are available from previous stage → use CROSSTABS with question_ids filter
+  * This SKIPS QuestionnaireRAG entirely (more efficient)
+  * Example: Stage 1 (toplines) finds VAND15 → Stage 2 (crosstabs) uses question_ids=["VAND15"]
+  * Set use_previous_results_for: "Extract question IDs from stage 1 for crosstab filtering"
 **Time Period Queries**:
 - "what about [X]?" = NEW question about X (not comparison)
 - Extract year+month → single-stage (route_to_sources)
     - Specific query ("approval in 2025?") → followup if ambiguous
 **Broad Queries** (no time specification):
+- For CROSSTABS queries with topic only (e.g., "how do immigration responses vary by X?"):
+  * Stage 1: Query QUESTIONNAIRE with topic filter (NO year/month) to find ALL questions across all polls
+  * Stage 2: Query CROSSTABS with question_ids from Stage 1 (skips QuestionnaireRAG, searches all namespaces)
+  * Set use_previous_results_for: "Extract question IDs from stage 1 for crosstab filtering"
+  * This is MUCH more efficient than creating one stage per poll
+  * DO NOT ask followup - these queries benefit from cross-poll analysis
+- For QUESTIONNAIRE queries without time period (e.g., "what questions about economy were asked?"):
+  * Ask followup: "Which time period are you interested in? (e.g., 2025, June 2025, or all polls)"
+  * These queries need time context to be useful
+- For TOPLINES queries without time period:
+  * Ask followup: "Which time period are you interested in? (e.g., 2025, June 2025)"
+  * These queries need time context to retrieve specific response data
+- For other broad queries:
+  * Assume analysis across ALL available polls (last 2+ years)
+  * Use execute_stages with one stage per available poll
 ## FILTERING
 - Map survey names: "Unity Poll" → "Vanderbilt_Unity_Poll"
 Simple queries (route_to_sources):
 - "what questions were asked in June 2025?" → questionnaire, year=2025, month=June
 - "what about June 2025?" (after June 2022) → questionnaire, year=2025, month=June (NOT staged)
+- "VAND5 responses in June 2025?" → toplines, year=2025, month=June (variable explicitly mentioned)
 - "questions about economy in 2025?" → questionnaire, year=2025, topic='economy'
+Topic-based toplines queries (MUST use execute_stages):
+- "Trump's approval in June 2025?" → execute_stages:
+  * Stage 1: QUESTIONNAIRE with topic='trump_administration' or query="Trump approval", year=2025, month=June
+  * Stage 2: TOPLINES with question_info from Stage 1
+- "Joe Biden's approval rating in June 2025?" → execute_stages:
+  * Stage 1: QUESTIONNAIRE with topic='biden_administration' or query="Joe Biden approval", year=2025, month=June
+  * Stage 2: TOPLINES with question_info from Stage 1
+Queries requiring followup:
+- "what questions about the economy were asked?" (no time) → followup: "Which time period are you interested in?"
+- "what questions were asked?" (no topic, no time) → followup: "Which topic and time period?"
+- "Trump's approval?" (no time) → followup: "Which time period are you interested in?"
 Multi-stage (execute_stages):
 - "compare June 2024 vs June 2025" → stage 1: 2024, stage 2: 2025
 - "how do responses vary by gender in 2025?" (no month) → stages for all 2025 months
+- "how do immigration responses vary by party?" (no time, topic-only crosstab query):
+  * Stage 1: QUESTIONNAIRE with topic='immigration' (no year/month) → finds ALL immigration questions
+  * Stage 2: CROSSTABS with question_ids from Stage 1 → searches all namespaces efficiently
+  * Set use_previous_results_for: "Extract question IDs from stage 1"
+  * DO NOT create one stage per poll - this is inefficient!
+  * DO NOT ask followup - cross-poll analysis is valuable for crosstab queries
 Follow-up handling:
 - "how do responses vary by gender for each of these questions?" (referencing previous)
+  → If questions were ALREADY retrieved in previous conversation turn:
+    * Use route_to_sources with CROSSTABS (single-stage)
+    * System automatically extracts question_info from previous results
+    * DO NOT create execute_stages with Stage 1 querying QuestionnaireRAG
+  → If no previous results in conversation, infer months from previous question's year, create stages per month
 - "what was trump's approval in 2025?" → followup: "Which month(s) in 2025?"
+- "June" (short answer) → combine with previous intent, use execute_stages:
+  * Stage 1: QUESTIONNAIRE with topic='trump_administration' or query="Trump approval", year=2025, month=June
+  * Stage 2: TOPLINES with question_info from Stage 1
+- "how does this vary by gender?" (after approval query)
+  → If previous turn already retrieved questions:
+    * Use route_to_sources with CROSSTABS (single-stage, question_info extracted automatically)
+  → If previous turn only retrieved toplines (no question_info), use execute_stages:
+    * Stage 1: QUESTIONNAIRE to identify question from toplines variable_name
+    * Stage 2: CROSSTABS with question_ids from Stage 1

prompts/synthesis_prompt_system.txt CHANGED Viewed

@@ -31,9 +31,16 @@ You are a survey data analyst synthesizing research results.
 - Instead: "Male: 45% approve, 30% disapprove. Female: 35% approve, 40% disapprove"
 - If the exact breakdown isn't in the context, state "Gender breakdown data is not available in the retrieved crosstabs"
-**3. RELEVANCE CHECK**
-- Only synthesize data relevant to the user's question
-- If information doesn't match, explicitly state this
 - If crosstabs exist but don't contain the requested demographic breakdown, state this clearly
 **4. DATA ACCURACY**
@@ -56,6 +63,8 @@ You are a survey data analyst synthesizing research results.
 - Acknowledge missing data naturally
 **7. PRESENTATION FORMAT**
 - Markdown tables for demographic breakdowns (political party, age, gender)
 - Clear headers, consistent formatting
 - Time-series organized by time period

 - Instead: "Male: 45% approve, 30% disapprove. Female: 35% approve, 40% disapprove"
 - If the exact breakdown isn't in the context, state "Gender breakdown data is not available in the retrieved crosstabs"
+**3. RELEVANCE CHECK - BE PERMISSIVE**
+- The data has ALREADY been filtered by topic, so assume it IS relevant
+- Subtopics and specific aspects ARE ALWAYS relevant:
+  * "personal financial situation" IS economy
+  * "tariffs" IS economy
+  * "stock market concerns" IS economy
+  * "gender-affirming healthcare" IS healthcare
+  * "Biden approval" IS presidential approval
+- ONLY reject data if it is about a COMPLETELY unrelated topic (e.g., user asked about "economy" but the data is about "favorite sports team")
+- When in doubt, PRESENT THE DATA - do not be overly cautious
 - If crosstabs exist but don't contain the requested demographic breakdown, state this clearly
 **4. DATA ACCURACY**
 - Acknowledge missing data naturally
 **7. PRESENTATION FORMAT**
+- **PRESENT ALL QUESTIONS**: If multiple questions are in the data, present ALL of them, not just one
+- For EACH question include: Question text, poll date/year/month, sample size (N), and demographic breakdowns
 - Markdown tables for demographic breakdowns (political party, age, gender)
 - Clear headers, consistent formatting
 - Time-series organized by time period

prompts/synthesis_prompt_user.txt CHANGED Viewed

@@ -18,10 +18,16 @@ Retrieved raw data:
 - INCORRECT: "The retrieved data provides a list of questions..."
 - Include metadata (year/month/poll) when available
-**1. RELEVANCE CHECK FIRST**
-- Check if each stage's data actually answers the question
-- If data is about a DIFFERENT topic, state this explicitly
-- Do NOT provide detailed analysis of irrelevant data
 **2. EXTRACT ACTUAL NUMBERS - NO GENERIC DESCRIPTIONS**
 - **QUESTIONNAIRE**: Format questions with text, response options, topics
@@ -37,6 +43,12 @@ Retrieved raw data:
 - Format numbers/percentages clearly
 **4. PRESENTATION FORMAT**
 - Use markdown tables for demographic breakdowns:
   ```
   | Response Option | Democrat | Republican | Independent |

 - INCORRECT: "The retrieved data provides a list of questions..."
 - Include metadata (year/month/poll) when available
+**1. ASSUME RELEVANCE - BE PERMISSIVE**
+- The data has ALREADY been filtered by topic, so it IS relevant
+- Subtopics and specific aspects ARE ALWAYS relevant:
+  * "personal financial situation" IS about economy
+  * "tariffs" IS about economy
+  * "stock market" IS about economy
+  * "gender-affirming healthcare" IS about healthcare
+  * "Trump approval" IS about presidential approval
+- ONLY reject data if it is about a COMPLETELY unrelated topic (e.g., user asked about "economy" but the data is about "favorite sports team")
+- When in doubt, PRESENT THE DATA - err on the side of inclusion
 **2. EXTRACT ACTUAL NUMBERS - NO GENERIC DESCRIPTIONS**
 - **QUESTIONNAIRE**: Format questions with text, response options, topics
 - Format numbers/percentages clearly
 **4. PRESENTATION FORMAT**
+- **CRITICAL: PRESENT ALL QUESTIONS** - If you have data for 5 questions, present ALL 5, not just 1
+- For EACH question, include:
+  * Question text
+  * Poll date (year/month)
+  * Sample size (N)
+  * Complete demographic breakdown with actual percentages
 - Use markdown tables for demographic breakdowns:
   ```
   | Response Option | Democrat | Republican | Independent |

questionnaire_rag.py CHANGED Viewed

@@ -1,12 +1,9 @@
 """
-Questionnaire RAG with better filtering and anti-hallucination measures.
-Key improvements:
-1. Correct Pinecone filter syntax
-2. Post-retrieval validation of filters
-3. Stronger anti-hallucination prompts
-4. Explicit checks for data existence
-5. Fuzzy survey name matching
 """
 import os
@@ -14,11 +11,9 @@ import json
 from typing import List, Dict, Any, Optional
 from pathlib import Path
-from langchain_openai import OpenAIEmbeddings, ChatOpenAI
 from langchain_pinecone import PineconeVectorStore
 from pinecone import Pinecone
-from langchain.prompts import ChatPromptTemplate
-from langchain.schema.output_parser import StrOutputParser
 try:
     from dotenv import load_dotenv
@@ -27,23 +22,29 @@ except ImportError:
     pass
-def _load_prompt_file(filename: str) -> str:
-    """Load a prompt file from the prompts directory"""
-    prompt_dir = Path(__file__).parent / "prompts"
-    prompt_path = prompt_dir / filename
-    if not prompt_path.exists():
-        raise FileNotFoundError(f"Prompt file not found: {prompt_path}")
-    return prompt_path.read_text(encoding="utf-8")
 class QuestionnaireRAG:
-    """
-    Improved questionnaire RAG with:
-    - Better Pinecone filtering
-    - Post-retrieval validation
-    - Anti-hallucination measures
-    - Fuzzy survey name matching
-    """
     def __init__(
         self,
@@ -62,17 +63,6 @@ class QuestionnaireRAG:
             model=os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
         )
-        # Initialize LLM
-        chat_model = os.getenv("OPENAI_MODEL", "gpt-4o")
-        self.llm = ChatOpenAI(model=chat_model, temperature=0)
-        # Load vector store
-        if not os.path.exists(persist_directory):
-            raise ValueError(
-                f"Vector store not found at {persist_directory}\n"
-                "Run create_questionnaire_vectorstores.py first"
-            )
         # Connect to Pinecone
         index_name = os.getenv("PINECONE_INDEX_NAME", "poll-questionnaire-index")
         namespace = os.getenv("PINECONE_NAMESPACE") or None
@@ -95,127 +85,90 @@ class QuestionnaireRAG:
     def _load_catalog(self) -> Dict[str, Dict]:
         """Load poll catalog"""
         catalog_path = Path(self.persist_directory) / "poll_catalog.json"
-        if catalog_path.exists():
-            with open(catalog_path, 'r') as f:
-                return json.load(f)
-        return {}
     def _load_questions_index(self) -> Dict[str, Dict]:
         """Load questions index"""
         questions_path = Path(self.persist_directory) / "questions_index.json"
-        if questions_path.exists():
-            with open(questions_path, 'r') as f:
-                return json.load(f)
-        return {}
-    def get_available_survey_names(self) -> List[str]:
-        """Get list of unique survey names from the catalog"""
-        survey_names = set()
-        for info in self.poll_catalog.values():
-            survey_names.add(info["survey_name"])
-        return sorted(survey_names)
     def _fuzzy_match_survey_name(self, requested_name: str) -> Optional[str]:
-        """
-        Fuzzy match a requested survey name to an actual stored name.
-        Examples:
-        - "Unity Poll" → "Vanderbilt_Unity_Poll"
-        - "unity poll" → "Vanderbilt_Unity_Poll"
-        - "Vanderbilt Unity" → "Vanderbilt_Unity_Poll"
-        """
-        # Get all unique survey names
-        available_names = self.get_available_survey_names()
-        # Normalize the requested name
         normalized_requested = requested_name.lower().replace("_", " ").replace("-", " ")
-        # Try exact match first (case-insensitive)
         for stored_name in available_names:
             normalized_stored = stored_name.lower().replace("_", " ").replace("-", " ")
             if normalized_requested == normalized_stored:
                 return stored_name
-        # Try substring matching - check if requested is in stored
-        for stored_name in available_names:
-            normalized_stored = stored_name.lower().replace("_", " ").replace("-", " ")
-            if normalized_requested in normalized_stored:
-                return stored_name
-        # Try reverse - check if stored is in requested
-        for stored_name in available_names:
-            normalized_stored = stored_name.lower().replace("_", " ").replace("-", " ")
-            if normalized_stored in normalized_requested:
                 return stored_name
-        # Try word-level matching - if all words from requested are in stored
         requested_words = set(normalized_requested.split())
         for stored_name in available_names:
             normalized_stored = stored_name.lower().replace("_", " ").replace("-", " ")
             stored_words = set(normalized_stored.split())
-            # Check if requested words are a subset of stored words
             if requested_words.issubset(stored_words):
                 return stored_name
         return None
     def _build_pinecone_filter(self, filters: Dict[str, Any]) -> Optional[Dict[str, Any]]:
-        """
-        Build proper Pinecone metadata filter with fuzzy survey name matching.
-        Pinecone filter syntax:
-        - Simple: {"year": 2025}
-        - Multiple: {"$and": [{"year": 2025}, {"month": "February"}]}
-        """
         if not filters:
             return None
         filter_conditions = []
-        # Handle year filter
-        if "year" in filters:
-            year = filters["year"]
-            if isinstance(year, str):
-                year = int(year)
             filter_conditions.append({"year": {"$eq": year}})
-        # Handle month filter
-        if "month" in filters:
-            month = filters["month"]
-            # Ensure proper capitalization
-            if isinstance(month, str):
-                month = month.capitalize()
             filter_conditions.append({"month": {"$eq": month}})
-        # Handle poll_date filter (exact match)
-        if "poll_date" in filters:
             filter_conditions.append({"poll_date": {"$eq": filters["poll_date"]}})
-        # Handle survey_name filter with fuzzy matching
-        if "survey_name" in filters:
-            requested_name = filters["survey_name"]
-            # Try to fuzzy match the survey name
-            matched_name = self._fuzzy_match_survey_name(requested_name)
             if matched_name:
-                if self.verbose and matched_name != requested_name:
-                    print(f"🔄 Mapped survey name '{requested_name}' → '{matched_name}'")
                 filter_conditions.append({"survey_name": {"$eq": matched_name}})
-            else:
-                if self.verbose:
-                    print(f"⚠️  Survey name '{requested_name}' not found in catalog")
-                    print(f"    Available: {self.get_available_survey_names()}")
-                # Don't add the filter if we can't match it - let other filters work
-        # Handle topics (if a topic is in the comma-separated list)
-        if "topic" in filters:
-            # This is trickier with comma-separated strings in metadata
-            # For now, we'll do post-filtering
-            pass
-        # Combine filters
         if len(filter_conditions) == 0:
             return None
         elif len(filter_conditions) == 1:
@@ -223,123 +176,6 @@ class QuestionnaireRAG:
         else:
             return {"$and": filter_conditions}
-    def _validate_results(
-        self,
-        docs: List[Any],
-        filters: Dict[str, Any]
-    ) -> List[Any]:
-        """
-        Validate that retrieved documents actually match the filters.
-        This catches cases where:
-        1. Pinecone filtering didn't work correctly
-        2. We need to do additional filtering (like topic matching)
-        """
-        if not filters:
-            return docs
-        validated_docs = []
-        for doc in docs:
-            metadata = doc.metadata
-            valid = True
-            # Check year
-            if "year" in filters:
-                expected_year = int(filters["year"]) if isinstance(filters["year"], str) else filters["year"]
-                if metadata.get("year") != expected_year:
-                    if self.verbose:
-                        print(f"⚠️  Filtered out: wrong year {metadata.get('year')} != {expected_year}")
-                    valid = False
-            # Check month
-            if "month" in filters and valid:
-                expected_month = filters["month"].capitalize() if isinstance(filters["month"], str) else filters["month"]
-                if metadata.get("month") != expected_month:
-                    if self.verbose:
-                        print(f"⚠️  Filtered out: wrong month {metadata.get('month')} != {expected_month}")
-                    valid = False
-            # Check poll_date
-            if "poll_date" in filters and valid:
-                if metadata.get("poll_date") != filters["poll_date"]:
-                    if self.verbose:
-                        print(f"⚠️  Filtered out: wrong poll_date {metadata.get('poll_date')} != {filters['poll_date']}")
-                    valid = False
-            # Check survey_name (with fuzzy matching)
-            if "survey_name" in filters and valid:
-                requested_name = filters["survey_name"]
-                matched_name = self._fuzzy_match_survey_name(requested_name)
-                if matched_name and metadata.get("survey_name") != matched_name:
-                    if self.verbose:
-                        print(f"⚠️  Filtered out: wrong survey {metadata.get('survey_name')} != {matched_name}")
-                    valid = False
-            # Check topic (if topic filter is provided)
-            if "topic" in filters and valid:
-                expected_topic = filters["topic"].lower()
-                # Topics are stored as comma-separated string in metadata
-                doc_topics = metadata.get("topics", "")
-                if isinstance(doc_topics, str):
-                    doc_topics_list = [t.strip().lower() for t in doc_topics.split(",")]
-                elif isinstance(doc_topics, list):
-                    doc_topics_list = [str(t).strip().lower() for t in doc_topics]
-                else:
-                    doc_topics_list = []
-                if self.verbose and valid:
-                    var_name = metadata.get("variable_name", "unknown")
-                    print(f"   🔍 Checking topic '{expected_topic}' for {var_name}: doc_topics={doc_topics_list}")
-                if expected_topic not in doc_topics_list:
-                    if self.verbose:
-                        var_name = metadata.get("variable_name", "unknown")
-                        print(f"⚠️  Filtered out {var_name}: topic '{expected_topic}' not in {doc_topics_list}")
-                    valid = False
-            if valid:
-                validated_docs.append(doc)
-        return validated_docs
-    def _get_prompt(self) -> ChatPromptTemplate:
-        """Get the improved system prompt with anti-hallucination measures"""
-        system_prompt_template = _load_prompt_file("questionnaire_rag_prompt.txt")
-        return ChatPromptTemplate.from_messages([
-            ("system", system_prompt_template),
-            ("human", "Answer:")
-        ])
-    def query(self, question: str, filters: Optional[Dict[str, Any]] = None, k: int = 20) -> str:
-        """
-        Query the questionnaire system.
-        Args:
-            question: Natural language question
-            filters: Optional filters (year, month, poll_date, survey_name)
-            k: Number of results to retrieve
-        Returns:
-            Answer string
-        """
-        result = self._query_internal(question, filters, k)
-        return result['answer']
-    def query_with_metadata(
-        self,
-        question: str,
-        filters: Optional[Dict[str, Any]] = None,
-        k: int = 20
-    ) -> Dict[str, Any]:
-        """
-        Query with full metadata about retrieval.
-        Returns:
-            Dict with 'answer', 'source_questions', 'num_sources', 'filters_applied'
-        """
-        return self._query_internal(question, filters, k)
     def retrieve_raw_data(
         self,
         question: str,
@@ -347,250 +183,92 @@ class QuestionnaireRAG:
         k: int = 20
     ) -> Dict[str, Any]:
         """
-        Retrieve raw data without LLM formatting.
-        Used by agent framework to get raw data for synthesis.
         Returns:
-            Dict with 'source_questions', 'num_sources', 'filters_applied', 'retrieved_docs'
         """
         if self.verbose:
-            print(f"\n📊 [Raw Data] Query: {question}")
             if filters:
                 print(f"🔍 Filters: {filters}")
         # Build Pinecone filter
         pinecone_filter = self._build_pinecone_filter(filters or {})
-        # Retrieve documents
         if pinecone_filter:
             if self.verbose:
-                print(f"🔧 Pinecone filter: {pinecone_filter}")
             retriever = self.vectorstore.as_retriever(
                 search_kwargs={"k": k, "filter": pinecone_filter}
             )
-        else:
-            retriever = self.vectorstore.as_retriever(search_kwargs={"k": k})
-        docs = retriever.invoke(question)
-        if self.verbose:
-            print(f"📥 Retrieved {len(docs)} documents from Pinecone")
-        # Validate results match filters
-        if filters:
-            docs = self._validate_results(docs, filters)
             if self.verbose:
-                print(f"✅ After validation: {len(docs)} documents")
-        # Check if we have any results
         if not docs:
-            return {
-                "source_questions": [],
-                "num_sources": 0,
-                "filters_applied": filters or {},
-                "retrieved_docs": []
-            }
-        # Reconstruct full questions
-        full_questions = []
-        seen_ids = set()
-        for doc in docs:
-            q_id = doc.metadata.get('question_id')
-            if q_id and q_id not in seen_ids:
-                if q_id in self.questions_by_id:
-                    full_questions.append(self.questions_by_id[q_id])
-                    seen_ids.add(q_id)
-        # Sort by position to maintain survey order
-        full_questions.sort(key=lambda q: (q.get('poll_date', ''), q.get('position', 0)))
-        return {
-            'source_questions': full_questions,
-            'num_sources': len(full_questions),
-            'filters_applied': filters or {},
-            'retrieved_docs': docs
-        }
-    def _query_internal(
-        self,
-        question: str,
-        filters: Optional[Dict[str, Any]] = None,
-        k: int = 20
-    ) -> Dict[str, Any]:
-        """Internal query implementation"""
-        if self.verbose:
-            print(f"\n📊 Query: {question}")
-            if filters:
-                print(f"🔍 Filters: {filters}")
-        # Build Pinecone filter
-        pinecone_filter = self._build_pinecone_filter(filters or {})
-        # Retrieve documents
-        if pinecone_filter:
             if self.verbose:
-                print(f"🔧 Pinecone filter: {pinecone_filter}")
-            retriever = self.vectorstore.as_retriever(
-                search_kwargs={"k": k, "filter": pinecone_filter}
-            )
-        else:
-            retriever = self.vectorstore.as_retriever(search_kwargs={"k": k})
-        docs = retriever.invoke(question)
-        if self.verbose:
-            print(f"📥 Retrieved {len(docs)} documents from Pinecone")
-        # Validate results match filters
-        if filters:
-            docs = self._validate_results(docs, filters)
             if self.verbose:
-                print(f"✅ After validation: {len(docs)} documents")
-        # Check if we have any results
         if not docs:
-            no_data_msg = f"No questionnaire data found"
-            if filters:
-                filter_desc = ", ".join([f"{k}={v}" for k, v in filters.items()])
-                no_data_msg += f" matching filters: {filter_desc}"
             return {
-                "answer": no_data_msg,
                 "source_questions": [],
                 "num_sources": 0,
-                "filters_applied": filters or {}
             }
-        # Reconstruct full questions
         full_questions = []
         seen_ids = set()
         for doc in docs:
             q_id = doc.metadata.get('question_id')
             if q_id and q_id not in seen_ids:
                 if q_id in self.questions_by_id:
-                    full_questions.append(self.questions_by_id[q_id])
                     seen_ids.add(q_id)
-        # Sort by position to maintain survey order
         full_questions.sort(key=lambda q: (q.get('poll_date', ''), q.get('position', 0)))
-        # Format context with explicit data availability info
-        context = self._format_context(full_questions, filters)
-        # Get prompt
-        prompt = self._get_prompt()
-        # Create chain
-        chain = (
-            {
-                "context": lambda x: context,
-                "question": lambda x: question,
-                "catalog": lambda x: self._get_catalog_summary()
-            }
-            | prompt
-            | self.llm
-            | StrOutputParser()
-        )
-        # Get answer
-        answer = chain.invoke(question)
         return {
-            'answer': answer,
             'source_questions': full_questions,
             'num_sources': len(full_questions),
-            'filters_applied': filters or {}
         }
-    def _format_context(
-        self,
-        questions: List[Dict],
-        filters: Optional[Dict[str, Any]] = None
-    ) -> str:
-        """Format questions as context with explicit data availability"""
-        if not questions:
-            filter_desc = ""
-            if filters:
-                filter_desc = f" matching {filters}"
-            return f"⚠️ NO DATA RETRIEVED{filter_desc}\n\nYou must inform the user that no data exists for their query."
-        context_parts = []
-        # Add explicit note about what data we have
-        polls_found = sorted(set(q['poll_date'] for q in questions))
-        context_parts.append(f"✅ DATA AVAILABLE FOR: {', '.join(polls_found)}")
-        # Add note about what was requested vs what was found
-        if filters:
-            if 'year' in filters and 'month' in filters:
-                requested = f"{filters['month']} {filters['year']}"
-                context_parts.append(f"🔍 REQUESTED: {requested}")
-        context_parts.append("")  # Blank line
-        context_parts.append("=" * 80)
-        context_parts.append("")
-        # Format each question
-        for i, q in enumerate(questions, 1):
-            part = f"""
---- Question {i} from {q['survey_name']} ({q['poll_date']}) ---
-Variable: {q['variable_name']}
-Question: {q['question_text']}
-Response Options: {' | '.join(q['response_options'])}
-Topics: {', '.join(q['topics'])}
-Question Type: {q['question_type']}
-Administration: {q['ask_condition']}
-"""
-            # Add skip logic/sampling
-            if q.get('skip_logic'):
-                part += f"Skip Logic: {q['skip_logic']}\n"
-            if q.get('half_sample_group'):
-                part += f"Half Sample Group: {q['half_sample_group']}\n"
-            # Add sibling variants
-            if q.get('sibling_variants'):
-                part += f"\nAlternate Versions (shown to different groups):\n"
-                for sib in q['sibling_variants']:
-                    sib_group = sib.get('half_sample_group', 'other group')
-                    part += f"  - [{sib_group}] {sib['question_text']}\n"
-            # Add sequence context
-            if q.get('previous_question'):
-                prev_vars = q.get('previous_question_variants', [])
-                if len(prev_vars) > 1:
-                    part += "\nPrevious Question (respondents saw one of these):\n"
-                    for pv in prev_vars:
-                        part += f"  - {pv['question_text']}\n"
-                else:
-                    part += f"\nPrevious Question: {q['previous_question']['question_text']}\n"
-            if q.get('next_question'):
-                next_vars = q.get('next_question_variants', [])
-                if len(next_vars) > 1:
-                    part += "\nNext Question (respondents saw one of these):\n"
-                    for nv in next_vars:
-                        part += f"  - {nv['question_text']}\n"
-                else:
-                    part += f"\nNext Question: {q['next_question']['question_text']}\n"
-            context_parts.append(part.strip())
-        return "\n\n".join(context_parts)
-    def _get_catalog_summary(self) -> str:
-        """Get summary of available polls"""
-        lines = ["Available polls:"]
-        for poll_date in sorted(self.poll_catalog.keys()):
-            info = self.poll_catalog[poll_date]
-            month_str = f" ({info['month']})" if info.get('month') else ""
-            lines.append(f"- {poll_date}{month_str}: {info['num_questions']} questions")
-        return "\n".join(lines)
     def get_available_polls(self) -> List[Dict[str, Any]]:
         """Get list of all available polls"""
@@ -605,51 +283,3 @@ Administration: {q['ask_condition']}
             for poll_date, info in sorted(self.poll_catalog.items())
         ]
-def main():
-    """Test CLI"""
-    import sys
-    openai_api_key = os.getenv("OPENAI_API_KEY")
-    pinecone_api_key = os.getenv("PINECONE_API_KEY")
-    if not openai_api_key or not pinecone_api_key:
-        print("Error: Missing API keys")
-        sys.exit(1)
-    rag = QuestionnaireRAG(
-        openai_api_key=openai_api_key,
-        pinecone_api_key=pinecone_api_key,
-        verbose=True
-    )
-    print("\n" + "="*80)
-    print("QUESTIONNAIRE RAG - TEST MODE")
-    print("="*80)
-    # Test fuzzy matching
-    print("\n🧪 TEST: Fuzzy survey name matching")
-    test_names = ["Unity Poll", "unity poll", "Vanderbilt Unity", "UNITY"]
-    for name in test_names:
-        matched = rag._fuzzy_match_survey_name(name)
-        print(f"  '{name}' → '{matched}'")
-    # Test with the problematic query
-    print("\n🧪 TEST: Query that previously failed")
-    print("Query: What questions were asked in the June 2025 Unity Poll?")
-    filters = {"year": 2025, "month": "June", "survey_name": "Unity Poll"}
-    result = rag.query_with_metadata(
-        "What questions were asked in the June 2025 Unity Poll?",
-        filters=filters
-    )
-    print(f"\n📊 Results:")
-    print(f"Found: {result['num_sources']} questions")
-    print(f"\n{result['answer'][:500]}...")
-    print("\n" + "="*80)
-if __name__ == "__main__":
-    main()

 """
+Questionnaire RAG Module
+------------------------
+Retrieves survey questions from Pinecone vectorstore.
+Metadata filtering first, semantic search fallback.
+Returns raw data only - no synthesis.
 """
 import os
 from typing import List, Dict, Any, Optional
 from pathlib import Path
+from langchain_openai import OpenAIEmbeddings
 from langchain_pinecone import PineconeVectorStore
 from pinecone import Pinecone
 try:
     from dotenv import load_dotenv
     pass
+class QuestionInfo:
+    """Lightweight record identifying one survey question.
+    Holds the variable name plus optional year, month, poll date and
+    question id; ``to_dict`` exposes the same five values as a plain dict.
+    """
+    def __init__(
+        self,
+        variable_name: str,
+        year: Optional[int] = None,
+        month: Optional[str] = None,
+        poll_date: Optional[str] = None,
+        question_id: Optional[str] = None,
+    ):
+        self.variable_name, self.year, self.month = variable_name, year, month
+        self.poll_date, self.question_id = poll_date, question_id
+    def to_dict(self) -> Dict[str, Any]:
+        """Return the five attributes as a dict keyed by attribute name."""
+        # Listed explicitly so the key order of the result stays fixed.
+        field_names = ("variable_name", "year", "month", "poll_date", "question_id")
+        return {name: getattr(self, name) for name in field_names}
 class QuestionnaireRAG:
+    """Questionnaire RAG with metadata-first filtering."""
     def __init__(
         self,
             model=os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
         )
         # Connect to Pinecone
         index_name = os.getenv("PINECONE_INDEX_NAME", "poll-questionnaire-index")
         namespace = os.getenv("PINECONE_NAMESPACE") or None
     def _load_catalog(self) -> Dict[str, Dict]:
         """Load poll catalog"""
         catalog_path = Path(self.persist_directory) / "poll_catalog.json"
+        if not catalog_path.exists():
+            # Try parent directory if not found
+            parent_path = Path(self.persist_directory).parent / "questionnaire_vectorstores" / "poll_catalog.json"
+            if parent_path.exists():
+                catalog_path = parent_path
+            else:
+                return {}
+        with open(catalog_path, 'r') as f:
+            return json.load(f)
     def _load_questions_index(self) -> Dict[str, Dict]:
         """Load questions index"""
         questions_path = Path(self.persist_directory) / "questions_index.json"
+        if not questions_path.exists():
+            # Try parent directory if not found
+            parent_path = Path(self.persist_directory).parent / "questionnaire_vectorstores" / "questions_index.json"
+            if parent_path.exists():
+                questions_path = parent_path
+            else:
+                return {}
+        with open(questions_path, 'r') as f:
+            return json.load(f)
     def _fuzzy_match_survey_name(self, requested_name: str) -> Optional[str]:
+        """Fuzzy match survey name"""
+        available_names = set()
+        for info in self.poll_catalog.values():
+            available_names.add(info["survey_name"])
         normalized_requested = requested_name.lower().replace("_", " ").replace("-", " ")
         for stored_name in available_names:
             normalized_stored = stored_name.lower().replace("_", " ").replace("-", " ")
             if normalized_requested == normalized_stored:
                 return stored_name
+            if normalized_requested in normalized_stored or normalized_stored in normalized_requested:
                 return stored_name
         requested_words = set(normalized_requested.split())
         for stored_name in available_names:
             normalized_stored = stored_name.lower().replace("_", " ").replace("-", " ")
             stored_words = set(normalized_stored.split())
             if requested_words.issubset(stored_words):
                 return stored_name
         return None
     def _build_pinecone_filter(self, filters: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+        """Build Pinecone metadata filter"""
         if not filters:
             return None
         filter_conditions = []
+        if "year" in filters and filters["year"] is not None:
+            year = int(filters["year"]) if isinstance(filters["year"], str) else filters["year"]
             filter_conditions.append({"year": {"$eq": year}})
+        if "month" in filters and filters["month"] is not None:
+            month = filters["month"].capitalize()
             filter_conditions.append({"month": {"$eq": month}})
+        if "poll_date" in filters and filters["poll_date"] is not None:
             filter_conditions.append({"poll_date": {"$eq": filters["poll_date"]}})
+        if "survey_name" in filters and filters["survey_name"] is not None:
+            matched_name = self._fuzzy_match_survey_name(filters["survey_name"])
             if matched_name:
                 filter_conditions.append({"survey_name": {"$eq": matched_name}})
+        if "question_ids" in filters and filters["question_ids"]:
+            question_ids = filters["question_ids"]
+            if isinstance(question_ids, list) and len(question_ids) > 0:
+                if len(question_ids) == 1:
+                    filter_conditions.append({"question_id": {"$eq": question_ids[0]}})
+                else:
+                    filter_conditions.append({"question_id": {"$in": question_ids}})
+        if "topic" in filters and filters["topic"]:
+            topic = filters["topic"].lower()
+            filter_conditions.append({"topics": {"$in": [topic]}})
         if len(filter_conditions) == 0:
             return None
         elif len(filter_conditions) == 1:
         else:
             return {"$and": filter_conditions}
     def retrieve_raw_data(
         self,
         question: str,
         k: int = 20
     ) -> Dict[str, Any]:
         """
+        Retrieve raw questionnaire data.
+        Metadata filtering first, semantic search fallback.
         Returns:
+            Dict with 'source_questions', 'num_sources', 'filters_applied', 'question_info'
         """
         if self.verbose:
+            print(f"\n📊 [Questionnaire] Query: {question}")
             if filters:
                 print(f"🔍 Filters: {filters}")
         # Build Pinecone filter
         pinecone_filter = self._build_pinecone_filter(filters or {})
+        # Try metadata filtering first
+        docs = []
         if pinecone_filter:
             if self.verbose:
+                print(f"🔧 Using metadata filter: {pinecone_filter}")
             retriever = self.vectorstore.as_retriever(
                 search_kwargs={"k": k, "filter": pinecone_filter}
             )
+            docs = retriever.invoke(question)
             if self.verbose:
+                print(f"📥 Retrieved {len(docs)} documents with metadata filter")
+        # Fallback to semantic search if no results
         if not docs:
             if self.verbose:
+                print(f"⚠️  No results with metadata filter, falling back to semantic search")
+            retriever = self.vectorstore.as_retriever(search_kwargs={"k": k * 2})
+            docs = retriever.invoke(question)
             if self.verbose:
+                print(f"📥 Retrieved {len(docs)} documents with semantic search")
         if not docs:
             return {
                 "source_questions": [],
                 "num_sources": 0,
+                "filters_applied": filters or {},
+                "question_info": []
             }
+        # Reconstruct full questions and extract question_info
         full_questions = []
         seen_ids = set()
+        question_info_list = []
         for doc in docs:
             q_id = doc.metadata.get('question_id')
             if q_id and q_id not in seen_ids:
                 if q_id in self.questions_by_id:
+                    q_data = self.questions_by_id[q_id]
+                    full_questions.append(q_data)
                     seen_ids.add(q_id)
+                    # Extract question_info
+                    question_info_list.append(QuestionInfo(
+                        variable_name=q_data.get("variable_name", ""),
+                        year=q_data.get("year"),
+                        month=q_data.get("month", ""),
+                        poll_date=q_data.get("poll_date", ""),
+                        question_id=q_id
+                    ))
+        # Sort by position
         full_questions.sort(key=lambda q: (q.get('poll_date', ''), q.get('position', 0)))
+        if self.verbose:
+            print(f"✅ Extracted {len(question_info_list)} question info entries")
         return {
             'source_questions': full_questions,
             'num_sources': len(full_questions),
+            'filters_applied': filters or {},
+            'question_info': [q.to_dict() for q in question_info_list]
         }
+    def get_available_survey_names(self) -> List[str]:
+        """Return the distinct survey names found in the poll catalog, sorted."""
+        # A set comprehension removes duplicates before sorting.
+        return sorted({entry["survey_name"] for entry in self.poll_catalog.values()})
     def get_available_polls(self) -> List[Dict[str, Any]]:
         """Get list of all available polls"""
             for poll_date, info in sorted(self.poll_catalog.items())
         ]

relevance_checker.py ADDED Viewed

	@@ -0,0 +1,248 @@

+"""
+Conversation Relevance Checker
+-------------------------------
+Determines if current question is related to previous conversation
+and identifies what data can be reused to minimize redundant API calls.
+"""
+import os
+from typing import List, Dict, Any, Optional, Literal
+from pathlib import Path
+from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
+from pydantic import BaseModel, Field
+def _load_prompt_file(filename: str) -> str:
+    """Read ``prompts/<filename>`` located next to this module and return its text.
+    Raises:
+        FileNotFoundError: if the file does not exist in the prompts directory.
+    """
+    prompt_path = Path(__file__).parent / "prompts" / filename
+    if prompt_path.exists():
+        # Prompt files are read as UTF-8 text.
+        return prompt_path.read_text(encoding="utf-8")
+    raise FileNotFoundError(f"Prompt file not found: {prompt_path}")
+class ReusableData(BaseModel):
+    """Indicates what data can be reused from previous conversation"""
+    # All three flags default to False, so nothing is marked reusable unless set.
+    # Meanings below follow the "Reusable Data" section of the fallback prompt
+    # returned by ConversationRelevanceChecker._get_default_prompt.
+    # True if the same questions can be reused (same topic, same time period).
+    questions: bool = False
+    # True if overall frequencies were already retrieved and are still relevant.
+    toplines: bool = False
+    # True if demographic breakdowns were already retrieved and are still relevant.
+    crosstabs: bool = False
+class RelevanceResult(BaseModel):
+    """Structured relevance assessment result"""
+    # This model is the schema handed to llm.with_structured_output() in
+    # ConversationRelevanceChecker.check_relevance.
+    # Whether the current question is related to the previous conversation.
+    is_related: bool
+    # One of the four labels described in the fallback prompt; "new_topic" is
+    # also what check_relevance reports when the LLM call fails.
+    relation_type: Literal[
+        "same_topic_different_demo",
+        "same_topic_different_time",
+        "trend_analysis",
+        "new_topic"
+    ]
+    # Which kinds of previously retrieved data can be reused.
+    reusable_data: ReusableData
+    # Per the fallback prompt: true if the current question asks about a different
+    # year/month than the previous one, false if unchanged or not specified.
+    time_period_changed: bool
+    # Free-text explanation of the assessment (printed in verbose mode).
+    reasoning: str
+class ConversationRelevanceChecker:
+    """
+    Checks relevance between current question and conversation history.
+    Uses LLM to determine if previous data can be reused.
+    ``check_relevance`` is the public entry point; the other methods build
+    the text that is substituted into the prompt template.
+    """
+    def __init__(self, llm, verbose: bool = False):
+        """
+        Initialize relevance checker.
+        Args:
+            llm: LangChain LLM instance (ChatOpenAI)
+            verbose: Whether to print debug information
+        """
+        self.llm = llm
+        self.verbose = verbose
+        # Load relevance check prompt
+        try:
+            self.prompt_template = _load_prompt_file("relevance_check_prompt.txt")
+        except FileNotFoundError:
+            # Fallback to inline prompt if file doesn't exist yet
+            self.prompt_template = self._get_default_prompt()
+    def _get_default_prompt(self) -> str:
+        """Fallback prompt template if file doesn't exist"""
+        return """You are analyzing conversation continuity in a multi-turn survey data analysis system.
+Your task: Determine if the current question is related to previous conversation and what data can be reused.
+## CONVERSATION HISTORY
+{conversation_summary}
+## PREVIOUSLY RETRIEVED DATA
+{previous_data_summary}
+## CURRENT QUESTION
+{current_question}
+## ANALYSIS REQUIRED
+1. **Is the current question related to the previous conversation?**
+   - YES if: Same topic, same questions, same time period (even if different demographic)
+   - YES if: Asking for trend/analysis of already-shown data
+   - NO if: Completely different topic
+   - NO if: Same topic but different time period (e.g., June 2025 → February 2025)
+2. **Relation Type** (if related):
+   - `same_topic_different_demo`: Same topic/questions, asking for different demographic breakdown
+   - `trend_analysis`: Asking for analysis/trends from already-retrieved data
+   - `same_topic_different_time`: Same topic but different time period
+   - `new_topic`: Completely different topic
+3. **Reusable Data**:
+   - `questions`: true if same questions can be reused (same topic, same time period)
+   - `toplines`: true if overall frequencies already retrieved and still relevant
+   - `crosstabs`: true if demographic breakdowns already retrieved and still relevant
+4. **Time Period Changed**:
+   - true if current question asks about different year/month than previous
+   - false if time period is same or not specified
+Respond with structured output."""
+    def _build_conversation_summary(self, conversation_history: List) -> str:
+        """
+        Build a summary of conversation history for the prompt.
+        Human messages become "USER: ..." lines and AI messages become
+        "ASSISTANT: ..." lines (truncated to 300 characters); any other
+        message type is skipped.
+        Returns:
+            The joined lines, or "No previous conversation" if there are none.
+        """
+        summary_lines = []
+        # Treat a missing history (None) the same as an empty one.
+        for msg in conversation_history or []:
+            if isinstance(msg, HumanMessage):
+                summary_lines.append(f"USER: {msg.content}")
+            elif isinstance(msg, AIMessage):
+                content = msg.content
+                # Message content is not guaranteed to be a plain string (it may be
+                # a list of content blocks); stringify first so the slice-and-append
+                # truncation below cannot raise TypeError.
+                if not isinstance(content, str):
+                    content = str(content)
+                # Truncate long AI responses
+                if len(content) > 300:
+                    content = content[:300] + "... (truncated)"
+                summary_lines.append(f"ASSISTANT: {content}")
+        return "\n".join(summary_lines) if summary_lines else "No previous conversation"
+    def _build_previous_data_summary(self, previous_stage_results: List) -> str:
+        """
+        Build a summary of previously retrieved data.
+        Args:
+            previous_stage_results: Objects exposing ``questionnaire_results``,
+                ``toplines_results`` and ``crosstabs_results`` attributes
+                (each a dict or a falsy value).
+        Returns:
+            One "Stage N:" header per entry followed by bullet lines, or
+            "No previous data retrieved" when the list is empty.
+        """
+        if not previous_stage_results:
+            return "No previous data retrieved"
+        summary_lines = []
+        for i, stage_result in enumerate(previous_stage_results, 1):
+            summary_lines.append(f"Stage {i}:")
+            # Questionnaire results
+            if stage_result.questionnaire_results:
+                q_res = stage_result.questionnaire_results
+                num_questions = len(q_res.get("source_questions", []))
+                question_info = q_res.get("question_info", [])
+                if question_info:
+                    # Show at most three variable names, then a count of the rest.
+                    sample_vars = [q.get("variable_name", "unknown") for q in question_info[:3]]
+                    sample_vars_str = ", ".join(sample_vars)
+                    if len(question_info) > 3:
+                        sample_vars_str += f" ... and {len(question_info) - 3} more"
+                    # Extract time period info (taken from the first entry only)
+                    time_info = []
+                    if question_info[0].get("year"):
+                        time_info.append(str(question_info[0]["year"]))
+                    if question_info[0].get("month"):
+                        time_info.append(question_info[0]["month"])
+                    time_str = " ".join(time_info) if time_info else "unspecified time"
+                    summary_lines.append(f"  - Retrieved {num_questions} question(s) from {time_str}")
+                    summary_lines.append(f"  - Variables: {sample_vars_str}")
+                elif num_questions:
+                    # Questions were retrieved but carry no question_info; still
+                    # report them so the stage does not look empty to the LLM.
+                    summary_lines.append(f"  - Retrieved {num_questions} question(s)")
+            # Toplines results
+            if stage_result.toplines_results:
+                t_res = stage_result.toplines_results
+                num_docs = len(t_res.get("retrieved_docs", []))
+                summary_lines.append(f"  - Retrieved {num_docs} topline document(s)")
+            # Crosstabs results
+            if stage_result.crosstabs_results:
+                c_res = stage_result.crosstabs_results
+                if "crosstab_docs_by_variable" in c_res:
+                    num_vars = len(c_res["crosstab_docs_by_variable"])
+                    summary_lines.append(f"  - Retrieved crosstabs for {num_vars} variable(s)")
+        return "\n".join(summary_lines) if summary_lines else "No data summary available"
+    def _render_prompt(
+        self,
+        current_question: str,
+        conversation_summary: str,
+        previous_data_summary: str
+    ) -> str:
+        """Fill the template's three placeholders in a single pass."""
+        import re  # local import: only this helper needs it
+        values = {
+            "conversation_summary": conversation_summary,
+            "previous_data_summary": previous_data_summary,
+            "current_question": current_question,
+        }
+        # One pass over the template, so placeholder-looking text inside a substituted
+        # value (e.g. a user message containing "{current_question}") is never expanded
+        # again. Any other curly braces in the template are left untouched, and the
+        # callable replacement avoids backslash-escape processing of the values.
+        pattern = re.compile(r"\{(conversation_summary|previous_data_summary|current_question)\}")
+        return pattern.sub(lambda match: values[match.group(1)], self.prompt_template)
+    def check_relevance(
+        self,
+        current_question: str,
+        conversation_history: List,
+        previous_stage_results: List
+    ) -> Dict[str, Any]:
+        """
+        Check relevance of current question to previous conversation.
+        Args:
+            current_question: The current user question
+            conversation_history: List of previous messages (HumanMessage, AIMessage)
+            previous_stage_results: List of StageResult objects from previous turns
+        Returns:
+            Dict with relevance assessment (is_related, relation_type, reusable_data, etc.)
+            If the LLM call fails, a "new_topic" result with nothing reusable is
+            returned and the error text is placed in "reasoning".
+        """
+        if self.verbose:
+            print("\n🔍 Checking conversation relevance...")
+        # Build prompt inputs
+        conversation_summary = self._build_conversation_summary(conversation_history)
+        previous_data_summary = self._build_previous_data_summary(previous_stage_results)
+        prompt = self._render_prompt(current_question, conversation_summary, previous_data_summary)
+        # Get structured output from LLM
+        try:
+            relevance_checker = self.llm.with_structured_output(RelevanceResult)
+            result = relevance_checker.invoke([
+                SystemMessage(content="You are a conversation continuity analyzer for survey data systems."),
+                HumanMessage(content=prompt)
+            ])
+            if self.verbose:
+                print(f"   Related: {result.is_related}")
+                print(f"   Type: {result.relation_type}")
+                print(f"   Reusable: questions={result.reusable_data.questions}, "
+                      f"toplines={result.reusable_data.toplines}, "
+                      f"crosstabs={result.reusable_data.crosstabs}")
+                print(f"   Time changed: {result.time_period_changed}")
+                print(f"   Reasoning: {result.reasoning}")
+            return {
+                "is_related": result.is_related,
+                "relation_type": result.relation_type,
+                "reusable_data": {
+                    "questions": result.reusable_data.questions,
+                    "toplines": result.reusable_data.toplines,
+                    "crosstabs": result.reusable_data.crosstabs
+                },
+                "time_period_changed": result.time_period_changed,
+                "reasoning": result.reasoning
+            }
+        except Exception as e:
+            # Deliberate best-effort boundary: any failure degrades to "nothing
+            # can be reused" instead of aborting the caller's turn.
+            if self.verbose:
+                print(f"   ⚠️  Error checking relevance: {e}")
+            # Return safe default (treat as new topic)
+            return {
+                "is_related": False,
+                "relation_type": "new_topic",
+                "reusable_data": {
+                    "questions": False,
+                    "toplines": False,
+                    "crosstabs": False
+                },
+                "time_period_changed": False,
+                "reasoning": f"Error during relevance check: {str(e)}"
+            }

survey_agent.py CHANGED Viewed

The diff for this file is too large to render. See raw diff

toplines_rag.py CHANGED Viewed

@@ -1,51 +1,43 @@
 """
-ToplinesRAG
------------
-Queries the prebuilt Pinecone toplines vectorstore and synthesizes
-a natural-language answer with citations using OpenAI.
 """
 import os
-import re
-from pathlib import Path
 from typing import Any, Dict, List, Optional
 from dotenv import load_dotenv
-from langchain_openai import OpenAIEmbeddings, ChatOpenAI
 from langchain_pinecone import PineconeVectorStore
 from pinecone import Pinecone
-from calendar import month_name
 load_dotenv()
-def _load_prompt_file(filename: str) -> str:
-    """Load a prompt file from the prompts directory"""
-    prompt_dir = Path(__file__).parent / "prompts"
-    prompt_path = prompt_dir / filename
-    if not prompt_path.exists():
-        raise FileNotFoundError(f"Prompt file not found: {prompt_path}")
-    return prompt_path.read_text(encoding="utf-8")
 class ToplinesRAG:
     def __init__(
         self,
-        persist_directory: str = "./toplines_vectorstores",
         index_name: Optional[str] = None,
         llm_model: str = "gpt-4-turbo",
     ):
-        self.persist_directory = Path(persist_directory)
         self.index_name = index_name or os.getenv("PINECONE_INDEX_NAME_TOPLINES", "toplines-index")
         self.namespace = os.getenv("PINECONE_NAMESPACE") or None
         self.openai_api_key = os.getenv("OPENAI_API_KEY")
         if not self.openai_api_key:
             raise ValueError("OPENAI_API_KEY not set")
-        pinecone_api_key = os.getenv("PINECONE_API_KEY_TOPLINES")
         if not pinecone_api_key:
-            raise ValueError("PINECONE_API_KEY_TOPLINES not set")
         self.embeddings = OpenAIEmbeddings(
             model=os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
@@ -56,166 +48,145 @@ class ToplinesRAG:
             index=self.index, embedding=self.embeddings, namespace=self.namespace
         )
-        self.llm_model = llm_model
-        self.llm = ChatOpenAI(
-            model=self.llm_model,
-            openai_api_key=self.openai_api_key,
-            temperature=0
-        )
-    # ----------------------------------------------------------
-    def _build_filter(self, filters: Dict[str, Any]) -> Optional[Dict]:
         """
-        Build Pinecone filter from filters dict.
-        Only includes valid metadata fields that exist in the vectorstore.
-        Ignores unsupported fields like 'topic', 'question_ids', etc.
         """
-        if not filters:
             return None
-        # Valid filter fields that exist in toplines metadata
-        VALID_FILTER_FIELDS = {"year", "month", "poll_date", "survey_name"}
-        # Filter to only include valid fields
         valid_filters = {k: v for k, v in filters.items()
                         if k in VALID_FILTER_FIELDS and v is not None}
         if not valid_filters:
             return None
-        clauses = [{k: {"$eq": str(v)}} for k, v in valid_filters.items()]
-        return {"$and": clauses} if len(clauses) > 1 else clauses[0]
-    # ----------------------------------------------------------
-    def _extract_filters_from_query(self, query: str) -> Dict[str, str]:
-        filters = {}
-        year_match = re.search(r"20\d{2}", query)
-        if year_match:
-            filters["year"] = year_match.group()
-        for i in range(1, 13):
-            if month_name[i].lower() in query.lower():
-                filters["month"] = month_name[i]
-                break
-        return filters
-    # ----------------------------------------------------------
-    def _synthesize_answer(self, query: str, docs: List[Dict]) -> str:
-        """Generate a human-readable answer from the retrieved docs."""
-        if not docs:
-            # No docs retrieved → truly irrelevant query
-            return (
-                "Your query does not match any Vanderbilt Unity Poll data. "
-                "This system only provides information from those polls."
-            )
-        # Format retrieved documents for context
-        context_snippets = "\n\n".join(
-            f"Survey: {d.metadata.get('survey_name', 'Vanderbilt Unity Poll')} "
-            f"({d.metadata.get('month', '')} {d.metadata.get('year', '')})\n"
-            f"Question: {d.metadata.get('variable_name', '')}\n"
-            f"Response: {d.metadata.get('response_label', '')}\n"
-            f"Pct: {d.metadata.get('pct', 'N/A')}\n"
-            f"Poll Date: {d.metadata.get('poll_date', 'N/A')}"
-            for d in docs
-        )
-        # Load prompt from file
-        prompt_template = _load_prompt_file("toplines_rag_prompt.txt")
-        prompt = prompt_template.format(
-            query=query,
-            context_snippets=context_snippets
-        )
-        completion = self.llm.invoke(prompt)
-        answer_text = completion.content.strip()
-        # Build sources section
-        sources = [
-            f"- {d.metadata.get('survey_name', 'Vanderbilt Unity Poll')} "
-            f"({d.metadata.get('poll_date', 'N/A')}) | Variable: {d.metadata.get('variable_name', 'N/A')}"
-            for d in docs
-        ]
-        return f"\n--- ANSWER ---\n\n{answer_text}\n\n--- SOURCES ---\n" + "\n".join(sources)
-    # ----------------------------------------------------------
-    def query_toplines(self, query: str, filters: Optional[Dict[str, Any]] = None, top_k: int = 5) -> str:
-        pinecone_filter = self._build_filter(filters or {})
-        # Try with filters first, but if no results, try without filters to see if data exists
-        docs = self.vector_store.similarity_search(query, k=top_k, filter=pinecone_filter)
-        # If no results with filters but filters were provided, try a broader search
-        if not docs and pinecone_filter:
-            # Try without filters to see if the query matches anything
-            docs_no_filter = self.vector_store.similarity_search(query, k=top_k * 2)
-            if docs_no_filter:
-                # Filter results manually by matching metadata
-                valid_filters = {k: str(v) for k, v in (filters or {}).items()
-                               if k in {"year", "month", "poll_date", "survey_name"} and v}
-                docs = [
-                    d for d in docs_no_filter
-                    if all(str(d.metadata.get(k, "")) == str(v) for k, v in valid_filters.items())
-                ]
-                # If still no matches after manual filtering, use the broader results
-                if not docs:
-                    docs = docs_no_filter[:top_k]
-        return self._synthesize_answer(query, docs)
-    # ----------------------------------------------------------
-    def retrieve_raw_data(self, query: str, filters: Optional[Dict[str, Any]] = None, top_k: int = 5) -> Dict[str, Any]:
         """
-        Retrieve raw data without LLM synthesis.
-        Used by agent framework to get raw data for synthesis.
         Returns:
-            Dict with 'retrieved_docs', 'num_sources', 'filters_applied'
         """
-        pinecone_filter = self._build_filter(filters or {})
-        # Try with filters first, but if no results, try without filters to see if data exists
-        docs = self.vector_store.similarity_search(query, k=top_k, filter=pinecone_filter)
-        # If no results with filters but filters were provided, try a broader search
-        if not docs and pinecone_filter:
-            # Try without filters to see if the query matches anything
-            docs_no_filter = self.vector_store.similarity_search(query, k=top_k * 2)
-            if docs_no_filter:
-                # Filter results manually by matching metadata
-                valid_filters = {k: str(v) for k, v in (filters or {}).items()
-                               if k in {"year", "month", "poll_date", "survey_name"} and v}
-                docs = [
-                    d for d in docs_no_filter
-                    if all(str(d.metadata.get(k, "")) == str(v) for k, v in valid_filters.items())
-                ]
-                # If still no matches after manual filtering, use the broader results
-                if not docs:
-                    docs = docs_no_filter[:top_k]
         return {
             "retrieved_docs": docs,
             "num_sources": len(docs),
-            "filters_applied": filters or {}
         }
-    # ----------------------------------------------------------
-    def interactive_loop(self):
-        print("ToplinesRAG ready! Type 'quit' or 'exit' to stop.\n")
-        while True:
-            query = input("Enter your poll question: ").strip()
-            if query.lower() in ("quit", "exit"):
-                print("Exiting ToplinesRAG. Goodbye!")
-                break
-            filters = self._extract_filters_from_query(query)
-            if filters:
-                print(f"Using filters: {filters}")
-            print("\nRetrieving answer...\n")
-            answer = self.query_toplines(query, filters=filters)
-            print(answer)
-            print("\n" + "-"*60 + "\n")
-if __name__ == "__main__":
-    rag = ToplinesRAG()
-    rag.interactive_loop()

 """
+Toplines RAG Module
+-------------------
+Retrieves topline response frequency data from Pinecone vectorstore.
+Uses question_info for precise metadata filtering.
+Returns raw data only - no synthesis.
 """
 import os
 from typing import Any, Dict, List, Optional
+from pathlib import Path
 from dotenv import load_dotenv
+from langchain_openai import OpenAIEmbeddings
 from langchain_pinecone import PineconeVectorStore
 from pinecone import Pinecone
 load_dotenv()
 class ToplinesRAG:
+    """Toplines RAG with question_info-based metadata filtering."""
     def __init__(
         self,
         index_name: Optional[str] = None,
         llm_model: str = "gpt-4-turbo",
+        verbose: bool = False
     ):
         self.index_name = index_name or os.getenv("PINECONE_INDEX_NAME_TOPLINES", "toplines-index")
         self.namespace = os.getenv("PINECONE_NAMESPACE") or None
+        self.verbose = verbose
         self.openai_api_key = os.getenv("OPENAI_API_KEY")
         if not self.openai_api_key:
             raise ValueError("OPENAI_API_KEY not set")
+        pinecone_api_key = os.getenv("PINECONE_API_KEY")
         if not pinecone_api_key:
+            raise ValueError("PINECONE_API_KEY not set")
         self.embeddings = OpenAIEmbeddings(
             model=os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
             index=self.index, embedding=self.embeddings, namespace=self.namespace
         )
+    def _build_filter_from_question_info(self, question_info_list: List[Dict[str, Any]]) -> Optional[Dict]:
         """
+        Build Pinecone filter from question_info list.
+        Matches on variable + year + month combination (no poll_date).
+        Args:
+            question_info_list: Dicts that may carry "variable_name", "year"
+                and "month"; no other keys are read here.
+        Returns:
+            A filter dict, or None when the list is empty or no entry yields
+            a condition. Conditions of one entry are joined with $and and
+            the per-entry clauses are joined with $or; a single condition
+            or a single clause is returned unwrapped.
         """
+        if not question_info_list:
+            return None
+        # Build filter conditions for each question_info
+        filter_clauses = []
+        for q_info in question_info_list:
+            conditions = []
+            var_name = q_info.get("variable_name")
+            if var_name:
+                # Match on "variable" field (Pinecone stores short code like "VAND5" in "variable" field)
+                # Also check "variable_name" as fallback
+                var_conditions = [
+                    {"variable": {"$eq": var_name}},
+                    {"variable_name": {"$eq": var_name}}
+                ]
+                conditions.append({"$or": var_conditions})
+            year = q_info.get("year")
+            if year:
+                # Pinecone stores year as integer
+                # int() raises if the value is not numeric; that error is not caught here.
+                conditions.append({"year": {"$eq": int(year)}})
+            month = q_info.get("month")
+            if month:
+                # Pinecone stores month as string (capitalized like "March", "June")
+                # Ensure month is capitalized to match Pinecone format
+                month_str = str(month).capitalize()
+                conditions.append({"month": {"$eq": month_str}})
+            # Entries with none of the three values set contribute no clause.
+            if conditions:
+                # Combine conditions with $and for this question
+                if len(conditions) == 1:
+                    filter_clauses.append(conditions[0])
+                else:
+                    filter_clauses.append({"$and": conditions})
+        if not filter_clauses:
             return None
+        # Combine all question filters with $or
+        if len(filter_clauses) == 1:
+            return filter_clauses[0]
+        else:
+            return {"$or": filter_clauses}
+    def _build_filter_from_filters(self, filters: Dict[str, Any]) -> Optional[Dict]:
+        """
+        Convert a plain filters dict into a Pinecone metadata filter.
+        Used for direct queries that come without question_info. Only year, month
+        and survey_name are honoured (poll_date is ignored) and keys whose value
+        is None are dropped. Returns None when nothing usable remains.
+        """
+        if not filters:
+            return None
+        # Per-field normalisation applied before comparison:
+        # year -> int, month -> capitalized string, survey_name -> string
+        coerce = {
+            "year": int,
+            "month": lambda value: str(value).capitalize(),
+            "survey_name": str,
+        }
+        conditions = [
+            {field: {"$eq": coerce[field](raw)}}
+            for field, raw in filters.items()
+            if field in coerce and raw is not None
+        ]
+        if not conditions:
+            return None
+        if len(conditions) == 1:
+            return conditions[0]
+        return {"$and": conditions}
+    def retrieve_raw_data(
+        self,
+        query: str,
+        question_info: Optional[List[Dict[str, Any]]] = None,
+        source_questions: Optional[List[Dict[str, Any]]] = None,
+        filters: Optional[Dict[str, Any]] = None,
+        top_k: int = 10
+    ) -> Dict[str, Any]:
         """
+        Retrieve raw topline data.
+        Uses question_info for metadata filtering if provided, otherwise uses filters.
+        Falls back to unfiltered semantic search (with top_k * 2 results) when no
+        filter could be built or the filtered search returns nothing.
+        Args:
+            query: User's query (always sent to the similarity search)
+            question_info: List of question info dicts; variable_name, year and month
+                are used for filtering (a poll_date entry, if present, is ignored)
+            source_questions: Optional list of full question dicts from previous stage
+                (passed through to the result unchanged)
+            filters: Optional filters dict (only consulted if question_info is not provided)
+            top_k: Number of results to retrieve
         Returns:
+            Dict with 'retrieved_docs', 'num_sources', 'filters_applied',
+            'question_info_used', 'source_questions'. Note that 'filters_applied'
+            and 'question_info_used' echo the arguments as passed in; they do not
+            indicate whether the metadata filter actually produced the results.
         """
+        if self.verbose:
+            print(f"\n📊 [Toplines] Query: {query}")
+            if question_info:
+                print(f"🔍 Question info: {len(question_info)} question(s)")
+            if filters:
+                print(f"🔍 Filters: {filters}")
+        # Build filter from question_info (preferred) or filters
+        pinecone_filter = None
+        if question_info:
+            pinecone_filter = self._build_filter_from_question_info(question_info)
+        elif filters:
+            pinecone_filter = self._build_filter_from_filters(filters)
+        # Try metadata filtering first
+        docs = []
+        if pinecone_filter:
+            if self.verbose:
+                print(f"🔧 Using metadata filter: {pinecone_filter}")
+            docs = self.vector_store.similarity_search(query, k=top_k, filter=pinecone_filter)
+            if self.verbose:
+                print(f"📥 Retrieved {len(docs)} documents with metadata filter")
+        # Fallback to semantic search if no results
+        if not docs:
+            if self.verbose:
+                # Only claim the filter came back empty when one was actually tried
+                if pinecone_filter:
+                    print("⚠️  No results with metadata filter, falling back to semantic search")
+                else:
+                    print("⚠️  No metadata filter available, using semantic search")
+            docs = self.vector_store.similarity_search(query, k=top_k * 2)
+            if self.verbose:
+                print(f"📥 Retrieved {len(docs)} documents with semantic search")
         return {
             "retrieved_docs": docs,
             "num_sources": len(docs),
+            "filters_applied": filters or {},
+            "question_info_used": question_info or [],
+            "source_questions": source_questions or []
         }

toplines_vectorstores/poll_catalog_toplines.json CHANGED Viewed

@@ -1,10 +1,50 @@
 {
   "2025-February": {
     "file": "toplines_data/Vanderbilt_Unity_Poll_2025_February_toplines.json",
     "poll_date": "2025-February",
-    "num_toplines": 41,
     "survey_name": "Vanderbilt Unity Poll",
-    "year": "2025",
     "month": "February"
   },
   "2025-June": {
@@ -12,7 +52,7 @@
     "poll_date": "2025-June",
     "num_toplines": 167,
     "survey_name": "Vanderbilt Unity Poll",
-    "year": "2025",
     "month": "June"
   }
 }

 {
+  "2023-June": {
+    "file": "toplines_data/Vanderbilt_Unity_Poll_2023_June_toplines.json",
+    "poll_date": "2023-June",
+    "num_toplines": 82,
+    "survey_name": "Vanderbilt Unity Poll",
+    "year": 2023,
+    "month": "June"
+  },
+  "2023-March": {
+    "file": "toplines_data/Vanderbilt_Unity_Poll_2023_March_toplines.json",
+    "poll_date": "2023-March",
+    "num_toplines": 40,
+    "survey_name": "Vanderbilt Unity Poll",
+    "year": 2023,
+    "month": "March"
+  },
+  "2024-March": {
+    "file": "toplines_data/Vanderbilt_Unity_Poll_2024_March_toplines.json",
+    "poll_date": "2024-March",
+    "num_toplines": 58,
+    "survey_name": "Vanderbilt Unity Poll",
+    "year": 2024,
+    "month": "March"
+  },
+  "2024-October": {
+    "file": "toplines_data/Vanderbilt_Unity_Poll_2024_October_toplines.json",
+    "poll_date": "2024-October",
+    "num_toplines": 69,
+    "survey_name": "Vanderbilt Unity Poll",
+    "year": 2024,
+    "month": "October"
+  },
+  "2024-September": {
+    "file": "toplines_data/Vanderbilt_Unity_Poll_2024_September_toplines.json",
+    "poll_date": "2024-September",
+    "num_toplines": 80,
+    "survey_name": "Vanderbilt Unity Poll",
+    "year": 2024,
+    "month": "September"
+  },
   "2025-February": {
     "file": "toplines_data/Vanderbilt_Unity_Poll_2025_February_toplines.json",
     "poll_date": "2025-February",
+    "num_toplines": 95,
     "survey_name": "Vanderbilt Unity Poll",
+    "year": 2025,
     "month": "February"
   },
   "2025-June": {
     "poll_date": "2025-June",
     "num_toplines": 167,
     "survey_name": "Vanderbilt Unity Poll",
+    "year": 2025,
     "month": "June"
   }
 }

toplines_vectorstores/toplines_index.json CHANGED Viewed

The diff for this file is too large to render. See raw diff