umangchaudhry committed on
Commit cc2626e · verified · 1 Parent(s): fc421eb

Upload 20 files
app.py CHANGED
@@ -1,336 +1,129 @@
 """
-Gradio interface for Survey Analysis Agent
-Host on Hugging Face Spaces
+Gradio ChatInterface for Survey Agent V2 - Simplified Version
+Uses ChatInterface to avoid API generation bugs
 """

 import os
-import gradio as gr
+import sys
+from pathlib import Path
+
+# Add parent directory to path for imports
+sys.path.append(str(Path(__file__).parent))
+
 from survey_agent import SurveyAnalysisAgent
-import uuid
-from datetime import datetime

-# Initialize agent (will be done once at startup)
-agent = None
-initialization_error = None
+try:
+    from dotenv import load_dotenv
+    load_dotenv()
+except ImportError:
+    pass
+
+import gradio as gr
+
+# Global agent
+agent = None

-def initialize_agent():
-    """Initialize the agent with API keys from environment"""
-    global agent, initialization_error
-
-    try:
-        openai_api_key = os.getenv("OPENAI_API_KEY")
-        pinecone_api_key = os.getenv("PINECONE_API_KEY")
-
-        if not openai_api_key:
-            initialization_error = "❌ OPENAI_API_KEY not found. Please set it in Space Settings → Repository Secrets."
-            return False
-
-        if not pinecone_api_key:
-            initialization_error = "❌ PINECONE_API_KEY not found. Please set it in Space Settings → Repository Secrets."
-            return False
-
-        # Check if vector store exists
-        if not os.path.exists("./questionnaire_vectorstores"):
-            initialization_error = "❌ Vector store directory not found. Please upload the questionnaire_vectorstores folder."
-            return False
-
-        agent = SurveyAnalysisAgent(
-            openai_api_key=openai_api_key,
-            pinecone_api_key=pinecone_api_key,
-            verbose=False  # Set to False for cleaner UI
-        )
-
-        return True
-
-    except Exception as e:
-        initialization_error = f"❌ Initialization error: {str(e)}"
-        return False
+
+def initialize_agent():
+    """Initialize the survey analysis agent"""
+    global agent
+
+    if agent is not None:
+        return agent
+
+    openai_api_key = os.getenv("OPENAI_API_KEY")
+    pinecone_api_key = os.getenv("PINECONE_API_KEY")
+
+    if not openai_api_key or not pinecone_api_key:
+        raise ValueError("Missing API keys")
+
+    print("Initializing Survey Analysis Agent...")
+    agent = SurveyAnalysisAgent(
+        openai_api_key=openai_api_key,
+        pinecone_api_key=pinecone_api_key,
+        verbose=True
+    )
+    print("✅ Agent initialized!")
+    return agent

-def chat_with_streaming(message, history, session_id):
-    """
-    Stream response for better UX
-
-    Args:
-        message: User's message
-        history: Chat history (not used in streaming)
-        session_id: Unique session identifier for conversation memory
-
-    Yields:
-        Partial responses as they become available
-    """
-    if initialization_error:
-        yield initialization_error
-        return
-
-    if not agent:
-        yield "⚠️ Agent not initialized. Please refresh the page."
-        return
-
-    if not message.strip():
-        return
-
-    try:
-        # Show that we're processing with a distinctive format
-        yield "⏳ **Processing your request...**\n\n🤔 Analyzing your question..."
-
-        # Debug: Check if stream_query exists
-        if not hasattr(agent, 'stream_query'):
-            print("⚠️ WARNING: agent.stream_query() not found, falling back to regular query")
-            yield agent.query(message, thread_id=session_id)
-            return
-
-        # Define the workflow stages
-        stages = {
-            "generate_research_brief": {"icon": "📋", "text": "Planning research strategy", "step": 1},
-            "execute_stage": {"icon": "📊", "text": "Retrieving data from surveys", "step": 2},
-            "extract_stage_context": {"icon": "🔗", "text": "Processing retrieved data", "step": 3},
-            "synthesize_response": {"icon": "✍️", "text": "Synthesizing answer", "step": 4}
-        }
-
-        total_steps = 4
-
-        # Stream events from agent
-        has_answer = False
-        event_count = 0
-        current_step = 0
-
-        for event in agent.stream_query(message, thread_id=session_id):
-            event_count += 1
-            print(f"📡 Stream event {event_count}: {list(event.keys()) if event else 'None'}")
-
-            if not event:
-                continue
-
-            # Get current node
-            node = list(event.keys())[0]
-            print(f"   Processing node: {node}")
-
-            # Build progress display
-            if node in stages:
-                stage_info = stages[node]
-                current_step = stage_info['step']
-
-                # Calculate percentage
-                percentage = int((current_step / total_steps) * 100)
-
-                # Create a clean progress indicator
-                progress_display = f"### ⏳ Processing your request... ({percentage}%)\n\n"
-                progress_display += f"> **Current step:** {stage_info['icon']} {stage_info['text']}\n\n"
-
-                yield progress_display
-
-            # Check for final answer
-            if node == "synthesize_response":
-                # Get final answer
-                state = event[node]
-                final_answer = state.get("final_answer")
-                if final_answer:
-                    print(f"   Got final answer ({len(final_answer)} chars)")
-                    yield final_answer
-                    has_answer = True
-                    return
-
-        print(f"📡 Stream complete. Total events: {event_count}, Has answer: {has_answer}")
-
-        # Fallback if streaming didn't provide answer
-        if not has_answer:
-            print("⚠️ No answer from streaming, using regular query")
-            yield agent.query(message, thread_id=session_id)
-
-    except Exception as e:
-        error_msg = f"❌ Error processing query: {str(e)}"
-        print(f"Error details: {e}")
-        import traceback
-        traceback.print_exc()
-        yield error_msg
-
-
-def create_new_session():
-    """Create a new session ID"""
-    return str(uuid.uuid4())


-def get_available_surveys():
-    """Get list of available surveys"""
-    if initialization_error or not agent:
-        return "Agent not initialized"
+def respond(message, history):
+    """Process user message and return bot response"""
+    global agent
+
+    # Initialize agent if needed
+    if agent is None:
+        try:
+            agent = initialize_agent()
+        except Exception as e:
+            return f"⚠️ Error: {str(e)}"

     try:
-        surveys = agent.questionnaire_rag.get_available_survey_names()
-        polls = agent.questionnaire_rag.get_available_polls()
-
-        info = "### Available Surveys\n\n"
-        info += f"**{', '.join(surveys)}**\n\n"
-        info += "### Available Time Periods\n\n"
-
-        # Group by year
-        by_year = {}
-        for poll in polls:
-            year = poll['year']
-            if year not in by_year:
-                by_year[year] = []
-            by_year[year].append(poll)
-
-        for year in sorted(by_year.keys(), reverse=True):
-            info += f"**{year}:**\n"
-            for poll in sorted(by_year[year], key=lambda x: x['month']):
-                info += f"- {poll['month']} ({poll['num_questions']} questions)\n"
-            info += "\n"
-
-        return info
+        # Use a default thread ID
+        thread_id = "gradio_session"
+        response = agent.query(message, thread_id=thread_id)
+        return response
     except Exception as e:
-        return f"Error retrieving survey info: {str(e)}"
+        return f"❌ Error: {str(e)}\n\nPlease try rephrasing your question."


-# Initialize agent at startup
-print("🚀 Initializing Survey Analysis Agent...")
-init_success = initialize_agent()
-
-if init_success:
-    print("✅ Agent initialized successfully!")
-else:
-    print(f"⚠️ Agent initialization failed: {initialization_error}")
-
-
-# Create Gradio interface with modern chat-first design
-with gr.Blocks(title="Survey Analysis Agent", theme=gr.themes.Soft()) as demo:
-
-    # Session state
-    session_id_state = gr.State(value=create_new_session())
-
-    # Main layout: chat takes priority
-    with gr.Row():
-        with gr.Column(scale=3):
-            # Header
-            gr.Markdown("""
-            # 📊 Survey Analysis Agent
-
-            Ask questions about Vanderbilt Unity Poll data using natural language.
-            I can analyze questions, response frequencies, and demographic breakdowns across multiple time periods.
-            """)
-
-            # Show initialization status if there's an error
-            if initialization_error:
-                gr.Markdown(f"## ⚠️ Setup Required\n\n{initialization_error}")
-
-            # Main chat interface
-            chatbot = gr.Chatbot(
-                label="",
-                height=500,
-                show_label=False,
-                type="messages",
-                placeholder="Ask me anything about the survey data..."
-            )
-
-            with gr.Row():
-                msg = gr.Textbox(
-                    label="",
-                    placeholder="e.g., What questions about the economy were asked in June 2025?",
-                    show_label=False,
-                    scale=9,
-                    container=False
-                )
-                submit = gr.Button("Send", scale=1, variant="primary")
-
-            with gr.Row():
-                clear = gr.Button("🔄 New Conversation", size="sm")
-
-            # Example questions
-            gr.Examples(
-                examples=[
-                    "What questions were asked in June 2025?",
-                    "Show me Trump's approval ratings in 2025",
-                    "What questions about the economy were asked in 2025?",
-                    "How do responses about immigration vary by political party?",
-                    "Compare healthcare questions from February and June 2025",
-                ],
-                inputs=msg,
-                label="💡 Example Questions"
-            )
-
-        # Collapsible sidebar with info
-        with gr.Column(scale=1):
-            with gr.Accordion("📋 Available Data", open=False):
-                survey_info = gr.Markdown(
-                    value=get_available_surveys() if init_success else "Agent not initialized",
-                )
-                refresh_info = gr.Button("🔄 Refresh", size="sm")
-
-            with gr.Accordion("🎯 What I Can Do", open=False):
-                gr.Markdown("""
-                **📝 Questionnaires**
-                - Question text & options
-                - Topics and themes
-                - Skip logic & sampling
-                - Question sequencing
-
-                **📊 Response Data**
-                - Overall percentages
-                - Demographic breakdowns
-                - Cross-tabulations
-                - Time comparisons
-                """)
-
-            with gr.Accordion("💡 Tips", open=False):
-                gr.Markdown("""
-                - Specify time periods when relevant
-                - Ask follow-up questions for more detail
-                - I maintain conversation context
-                - Request comparisons across time periods
-                """)
-
-            with gr.Accordion("🔧 Current Status", open=False):
-                gr.Markdown("""
-                ✅ Questionnaire data
-                ✅ Toplines (response %)
-                ✅ Crosstabs (demographics)
-                ⏳ SQL queries (coming soon)
-                """)

-    # Footer
-    gr.Markdown("""
-    ---
-    💬 **Conversation Memory:** I remember our conversation history, so feel free to ask follow-up questions
-    or reference previous queries (e.g., "Show me the crosstabs for those questions").
-    """)

-    # Event handlers
-    def respond(message, chat_history, session_id):
-        """Handle message with streaming updates"""
-        if not message.strip():
-            return chat_history, ""
-
-        # Add user message
-        chat_history.append({"role": "user", "content": message})
-
-        # Add placeholder for assistant response
-        chat_history.append({"role": "assistant", "content": ""})
-
-        # Stream updates
-        for partial_response in chat_with_streaming(message, chat_history, session_id):
-            chat_history[-1]["content"] = partial_response
-            yield chat_history, ""
-
-        # Final return
-        yield chat_history, ""
-
-    def clear_chat():
-        """Clear chat and create new session"""
-        new_session = create_new_session()
-        return [], new_session
-
-    # Wire up events
-    msg.submit(respond, [msg, chatbot, session_id_state], [chatbot, msg])
-    submit.click(respond, [msg, chatbot, session_id_state], [chatbot, msg])
-    clear.click(clear_chat, None, [chatbot, session_id_state])
-    refresh_info.click(get_available_surveys, None, survey_info)
-
-
-# Launch the app
+# Create the interface
+print("Creating Gradio interface...")
+
+# Create a custom chatbot with larger height
+chatbot = gr.Chatbot(
+    height=650,  # Increased height for better readability
+    show_copy_button=True,  # Allow copying responses
+)
+
+demo = gr.ChatInterface(
+    respond,
+    chatbot=chatbot,
+    title="🗳️ Vanderbilt Unity Poll Survey Agent",
+    description="""
+    ### AI-Powered Analysis of Survey Data
+
+    Ask questions about American public opinion using natural language.
+    The system will search through survey data and provide comprehensive answers.
+
+    **Example questions:**
+    - What do Americans think about immigration in June 2025?
+    - How has Biden's approval rating changed over time?
+    - Show me views on the economy by political party
+    - Break that down by gender
+
+    **Available data:**
+    - 9 polls from 2023-2025
+    - 125 questions across topics like immigration, economy, healthcare, etc.
+    - Demographic breakdowns by party, gender, age, and more
+    """,
+    examples=[
+        "What do Americans think about immigration in June 2025?",
+        "How has Biden's approval rating changed?",
+        "Show me views on the economy by political party",
+    ],
+    theme=gr.themes.Soft(),
+    retry_btn=None,
+    undo_btn=None,
+    clear_btn="Clear Chat",
+)

 if __name__ == "__main__":
+    print("\nLaunching Gradio interface...")
+    print("The interface will open at http://127.0.0.1:7860")
+    print("\nPress Ctrl+C to stop.\n")
+
+    # Pre-initialize the agent
+    try:
+        initialize_agent()
+    except Exception as e:
+        print(f"⚠️ Warning: {e}")
+
     demo.launch(
-        server_name="0.0.0.0",
+        server_name="127.0.0.1",
         server_port=7860,
-        share=False
+        share=False,
+        show_error=True
     )
+
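For context on the new design: `gr.ChatInterface` owns the chat loop, so the app only supplies a `respond(message, history)` callback — Gradio passes the accumulated history and renders whatever string the callback returns. A minimal self-contained sketch of that contract (the echo reply is a stand-in for `agent.query`):

```python
import gradio as gr

def respond(message, history):
    # `history` holds the prior turns; `message` is the new user turn.
    # A real handler would call something like agent.query(message, thread_id=...).
    return f"You said: {message}"

demo = gr.ChatInterface(respond, title="Echo sketch")

if __name__ == "__main__":
    demo.launch(server_name="127.0.0.1", server_port=7860)
```

One version caveat: the `retry_btn`, `undo_btn`, and `clear_btn` keyword arguments used in the diff exist in Gradio 4.x but were removed in later major releases, so the pinned Gradio version matters. Note also that the fixed `thread_id = "gradio_session"` means all visitors share one conversation memory; a per-session ID (for example via `gr.State`) would isolate them.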
config.py ADDED
@@ -0,0 +1,130 @@
+"""
+Configuration constants for Survey Agent V2
+"""
+
+# Valid topics that exist in the questionnaire vectorstore metadata
+# These are the only topics that can be used for metadata filtering
+VALID_TOPICS = {
+    "biden_administration",
+    "confidence_institutions",
+    "economy",
+    "education",
+    "elections",
+    "foreign_policy",
+    "general",
+    "healthcare",
+    "immigration",
+    "judicial",
+    "technology",
+    "trump_administration",
+}
+
+# Topic mapping for common variations/synonyms
+TOPIC_MAPPINGS = {
+    # Immigration variations
+    "deportation": "immigration",
+    "deporting": "immigration",
+    "border": "immigration",
+    "visa": "immigration",
+    "visas": "immigration",
+    "undocumented": "immigration",
+    "illegal immigration": "immigration",
+
+    # Economy variations
+    "tariffs": "economy",
+    "tariff": "economy",
+    "finances": "economy",
+    "financial": "economy",
+    "stock market": "economy",
+    "inflation": "economy",
+
+    # Education variations
+    "college": "education",
+    "colleges": "education",
+    "university": "education",
+    "universities": "education",
+    "school": "education",
+    "schools": "education",
+
+    # Healthcare variations
+    "health": "healthcare",
+    "medical": "healthcare",
+    "wellness": "healthcare",
+
+    # Technology variations
+    "ai": "technology",
+    "artificial intelligence": "technology",
+    "innovation": "technology",
+
+    # Elections variations
+    "voting": "elections",
+    "vote": "elections",
+    "electoral": "elections",
+    "candidate": "elections",
+    "candidates": "elections",
+
+    # Trump variations
+    "trump": "trump_administration",
+    "maga": "trump_administration",
+
+    # Biden variations
+    "biden": "biden_administration",
+
+    # Judicial variations
+    "court": "judicial",
+    "courts": "judicial",
+    "judge": "judicial",
+    "judges": "judicial",
+    "ruling": "judicial",
+    "rulings": "judicial",
+
+    # Foreign policy variations
+    "china": "foreign_policy",
+    "international": "foreign_policy",
+    "foreign": "foreign_policy",
+
+    # Confidence variations
+    "confidence": "confidence_institutions",
+    "trust": "confidence_institutions",
+    "institutions": "confidence_institutions",
+}
+
+
+def normalize_topic(topic: str) -> str:
+    """
+    Normalize a topic string to a valid topic.
+
+    Args:
+        topic: The topic to normalize (case-insensitive)
+
+    Returns:
+        Normalized topic if valid/mappable, else 'general'
+    """
+    if not topic:
+        return "general"
+
+    topic_lower = topic.lower().strip()
+
+    # Check if it's already a valid topic
+    if topic_lower in VALID_TOPICS:
+        return topic_lower
+
+    # Check if it can be mapped
+    if topic_lower in TOPIC_MAPPINGS:
+        return TOPIC_MAPPINGS[topic_lower]
+
+    # Check for partial matches (e.g., "trump administration" → "trump_administration")
+    for valid_topic in VALID_TOPICS:
+        if topic_lower.replace("_", " ") == valid_topic.replace("_", " "):
+            return valid_topic
+        if topic_lower in valid_topic or valid_topic in topic_lower:
+            return valid_topic
+
+    # If no match, return general (will use semantic search)
+    return "general"
+
+
+def is_valid_topic(topic: str) -> bool:
+    """Check if a topic is valid for metadata filtering"""
+    return topic.lower().strip() in VALID_TOPICS
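Since `normalize_topic` drives the metadata filtering, a few illustrative calls (hypothetical, assuming `config.py` is on the import path) show how the three matching stages resolve:

```python
from config import normalize_topic, is_valid_topic

print(normalize_topic("Tariffs"))               # "economy" (synonym via TOPIC_MAPPINGS)
print(normalize_topic("immigration"))           # "immigration" (already in VALID_TOPICS)
print(normalize_topic("trump administration"))  # "trump_administration" (partial match)
print(normalize_topic("sports"))                # "general" (no match; semantic search fallback)
print(is_valid_topic("economy"))                # True
```

Note that the substring check (`topic_lower in valid_topic or valid_topic in topic_lower`) iterates over a set, so an ambiguous input that overlaps several valid topics resolves to whichever one happens to be reached first.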
crosstab_rag.py CHANGED
@@ -1,155 +1,221 @@
1
- #!/usr/bin/env python3
2
  """
3
- rag_crosstab_query.py
4
-
5
- Full Crosstab RAG pipeline:
6
- - Parse user query for survey/year/month/topic
7
- - Use QuestionnaireRAG to find matching questions (reuses existing vectorstore)
8
- - Extract variable names from matched questions
9
- - Query Pinecone within the appropriate namespace (survey crosstabs namespace)
10
- - Collect all parts for the matched question(s)
11
- - Summarize with the LLM, cite source filenames/part ids
12
  """
13
 
14
  import os
15
- import re
16
- import argparse
17
  from typing import List, Dict, Optional, Any
18
  from pathlib import Path
19
 
20
  from dotenv import load_dotenv
21
-
22
- from langchain_openai import OpenAIEmbeddings, ChatOpenAI
23
  from langchain.schema import Document
24
  from langchain_pinecone import PineconeVectorStore
25
  from pinecone import Pinecone
26
 
27
- # Import QuestionnaireRAG to reuse existing question matching
28
- from questionnaire_rag import QuestionnaireRAG
29
-
30
  load_dotenv()
31
 
32
-
33
- def _load_prompt_file(filename: str) -> str:
34
- """Load a prompt file from the prompts directory"""
35
- prompt_dir = Path(__file__).parent / "prompts"
36
- prompt_path = prompt_dir / filename
37
- if not prompt_path.exists():
38
- raise FileNotFoundError(f"Prompt file not found: {prompt_path}")
39
- return prompt_path.read_text(encoding="utf-8")
40
-
41
- # -------------------------
42
- # Config / Environment
43
- # -------------------------
44
- OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
45
- PINECONE_API_KEY = os.getenv("PINECONE_API_KEY_CROSSTABS")
46
- PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME_CROSSTABS", "crosstab-index")
47
-
48
- if not OPENAI_API_KEY:
49
- raise ValueError("OPENAI_API_KEY environment variable not set")
50
- if not PINECONE_API_KEY:
51
- raise ValueError("PINECONE_API_KEY_CROSSTABS environment variable not set")
52
-
53
- EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
54
- LLM_MODEL = os.getenv("OPENAI_LLM_MODEL", "gpt-4o")
55
 
56
  PINECONE_RETRIEVE_K = 100
57
  MAX_CROSSTAB_CHUNKS = 50
58
 
59
- # -------------------------
60
- # Utilities
61
- # -------------------------
62
- def extract_year_month_poll(query: str) -> Dict[str, Optional[str]]:
63
- out = {"year": None, "month": None, "poll": None}
64
- q = query.lower()
65
- ym = re.search(r"\b(20\d{2})\b", q)
66
- if ym:
67
- out["year"] = ym.group(1)
68
- months = ["january","february","march","april","may","june",
69
- "july","august","september","october","november","december"]
70
- for m in months:
71
- if m in q:
72
- out["month"] = m.capitalize()
73
- break
74
- if not out["month"]:
75
- if any(word in q for word in ["recent", "latest", "current", "now"]):
76
- out["month"] = "June"
77
- if not out["year"]:
78
- out["year"] = "2025"
79
- if "vanderbilt" in q or "unity" in q:
80
- out["poll"] = "Vanderbilt_Unity_Poll"
81
- return out
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
 
84
- # -------------------------
85
- # Pinecone retrieval + assembly
86
- # -------------------------
87
  class CrosstabRetriever:
88
- def __init__(self,
89
- pinecone_api_key: str = PINECONE_API_KEY,
90
- index_name: str = PINECONE_INDEX_NAME,
91
- embed_model: str = EMBED_MODEL,
92
- openai_api_key: str = OPENAI_API_KEY,
93
- verbose: bool = False):
 
 
 
 
94
  self.pc = Pinecone(api_key=pinecone_api_key)
95
  self.index_name = index_name
96
  self.embedder = OpenAIEmbeddings(model=embed_model, openai_api_key=openai_api_key)
97
  self.verbose = verbose
98
 
99
- def _make_vectorstore(self, namespace: str) -> PineconeVectorStore:
100
- index = self.pc.Index(self.index_name)
101
- return PineconeVectorStore(index=index, embedding=self.embedder, namespace=namespace)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
102
 
103
- def retrieve_parts_for_variable(self, namespace: str, variable_prefix: str, user_query: str = None, k: int = PINECONE_RETRIEVE_K) -> List[Document]:
 
 
 
 
 
104
  """
105
- Retrieve crosstab chunks for a specific variable using direct metadata filtering.
106
-
107
- Since we already know the exact variable name from QuestionnaireRAG, we use
108
- Pinecone metadata filtering instead of semantic search for better accuracy and speed.
109
 
110
  Args:
111
- namespace: Pinecone namespace (e.g., "Vanderbilt_Unity_Poll_2025_February_cleaned_data_crosstabs")
112
- variable_prefix: Exact variable name (e.g., "VAND15")
113
- user_query: Not used anymore, kept for backward compatibility
114
- k: Maximum number of chunks to retrieve (not really needed with exact filtering)
115
 
116
  Returns:
117
- List of Document objects with crosstab data for the variable
118
  """
119
  try:
120
  index = self.pc.Index(self.index_name)
121
  stats = index.describe_index_stats()
122
- namespaces = stats.get('namespaces', {})
123
- if namespace not in namespaces:
124
- return []
125
- except Exception:
126
- return []
127
-
128
- # Clean variable name - the CSV filename is like "VAND15_crosstab.csv"
129
- # So the variable_name stored is "VAND15_crosstab" (from csv_file.stem)
130
- # But QuestionnaireRAG returns "VAND15"
131
- # We need to match both formats
132
- base_variable = variable_prefix.replace("_crosstab", "").split("_")[0]
133
- variable_with_suffix = f"{base_variable}_crosstab"
134
-
135
- if self.verbose:
136
- print(f" πŸ” Looking for variable: '{base_variable}' or '{variable_with_suffix}' in namespace: '{namespace}'")
137
-
138
- # Use Pinecone metadata filtering for exact match
139
- # Try both formats: "VAND15" and "VAND15_crosstab"
140
- try:
141
- # Pinecone supports $or for multiple conditions
142
- filter_dict = {
143
- "$or": [
144
- {"variable_name": {"$eq": base_variable}},
145
- {"variable_name": {"$eq": variable_with_suffix}}
146
- ]
147
- }
148
 
149
- if self.verbose:
150
- print(f" πŸ”§ Filter: {filter_dict}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
 
152
- # Get embedding dimension - we need a valid vector even for metadata-only queries
153
  embed_dim = 1536 # Default for text-embedding-3-small
154
  try:
155
  if hasattr(self.embedder, 'model') and 'small' in str(self.embedder.model).lower():
@@ -159,538 +225,306 @@ class CrosstabRetriever:
159
  except:
160
  pass
161
 
162
- # Use a dummy vector (all zeros is fine for metadata-filtered queries)
163
- # Pinecone requires a vector but with exact filters, ranking won't matter
164
  dummy_vector = [0.0] * embed_dim
 
165
 
166
- result = index.query(
167
- vector=dummy_vector,
168
- top_k=k,
169
- namespace=namespace,
170
- filter=filter_dict,
171
- include_metadata=True
172
- )
173
 
174
- if self.verbose:
175
- print(f" πŸ“Š Pinecone query returned {len(result.matches)} matches")
176
-
177
- docs = []
178
- for match in result.matches:
179
- metadata = match.metadata or {}
180
 
181
- # Debug: print what we found
182
  if self.verbose:
183
- found_var = metadata.get("variable_name", "N/A")
184
- found_qid = metadata.get("question_id", "N/A")
185
- print(f" πŸ“„ Found: variable_name='{found_var}', question_id='{found_qid}'")
 
 
 
186
 
187
- # Pinecone stores content differently depending on how it was uploaded
188
- # Try multiple ways to get the content
189
- content = None
190
 
191
- # Method 1: Check if there's a 'text' field in metadata (LangChain storage)
192
- if 'text' in metadata:
193
- content = metadata.pop('text', '')
194
- # Method 2: Check if content is in the match object itself
195
- elif hasattr(match, 'values') and match.values:
196
- # This shouldn't happen with metadata filtering, but just in case
197
- pass
198
- # Method 3: Try to reconstruct from metadata if available
199
- elif 'page_content' in metadata:
200
- content = metadata.pop('page_content', '')
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
 
202
- # If we still don't have content, we can't use this document
203
- if not content:
 
 
 
 
 
 
 
204
  if self.verbose:
205
- print(f" ⚠️ No content found for match, skipping")
206
- continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
207
 
208
- docs.append(Document(page_content=content, metadata=metadata))
 
 
 
 
 
 
 
 
209
 
210
  if self.verbose:
211
- print(f" βœ… Successfully loaded {len(docs)} document(s)")
 
212
 
213
- # Sort by chunk_index to maintain order
214
- docs.sort(key=lambda d: d.metadata.get("chunk_index", 999))
215
- return docs[:MAX_CROSSTAB_CHUNKS]
216
 
217
  except Exception as e:
218
  if self.verbose:
219
- print(f" ❌ Error with metadata filter: {e}")
220
- # Fallback: if metadata filtering fails, try fetching sample documents to debug
221
- if self.verbose:
222
- print(f" πŸ”„ Falling back to manual filtering...")
223
- try:
224
- # Try to fetch a sample to see what's actually in the namespace
225
- # First, try fetching without filter to see what variable names exist
226
- sample_result = index.query(
227
- vector=[0.0] * 1536, # Dummy vector
228
- top_k=10, # Just get a few samples
229
- namespace=namespace,
230
- include_metadata=True
231
- )
232
-
233
- if self.verbose and sample_result.matches:
234
- print(f" πŸ“‹ Sample variables in namespace:")
235
- for sample in sample_result.matches[:5]:
236
- sample_meta = sample.metadata or {}
237
- sample_var = sample_meta.get("variable_name", "N/A")
238
- sample_qid = sample_meta.get("question_id", "N/A")
239
- print(f" - variable_name: '{sample_var}', question_id: '{sample_qid}'")
240
-
241
- # Now try to find matches manually
242
- result = index.query(
243
- vector=[0.0] * 1536, # Dummy vector
244
- top_k=k * 2, # Get more to filter from
245
- namespace=namespace,
246
- include_metadata=True
247
- )
248
- docs = []
249
- for match in result.matches:
250
- metadata = match.metadata or {}
251
- var_name = metadata.get("variable_name", "")
252
- question_id = metadata.get("question_id", "")
253
-
254
- # Check if this matches our variable (case-insensitive)
255
- # Try matching both "VAND15" and "VAND15_crosstab" formats
256
- var_match = (base_variable.lower() == var_name.lower() or
257
- variable_with_suffix.lower() == var_name.lower() or
258
- question_id.lower().startswith(base_variable.lower() + "_") or
259
- question_id.lower().startswith(base_variable.lower()))
260
-
261
- if var_match:
262
- # Try to get content
263
- content = metadata.pop('text', '') or metadata.pop('page_content', '') or ''
264
- if content:
265
- docs.append(Document(page_content=content, metadata=metadata))
266
- elif self.verbose:
267
- print(f" ⚠️ Matched variable '{var_name}' but no content found")
268
-
269
- docs.sort(key=lambda d: d.metadata.get("chunk_index", 999))
270
- if self.verbose:
271
- print(f" βœ… Fallback found {len(docs)} document(s)")
272
- return docs[:MAX_CROSSTAB_CHUNKS]
273
- except Exception as fallback_error:
274
- if self.verbose:
275
- print(f" ❌ Fallback also failed: {fallback_error}")
276
- return []
277
 
278
- # -------------------------
279
- # LLM summarizer
280
- # -------------------------
281
- class CrosstabSummarizer:
282
- def __init__(self, llm_model: str = LLM_MODEL, openai_api_key: str = OPENAI_API_KEY):
283
- self.llm = ChatOpenAI(model=llm_model, openai_api_key=openai_api_key, temperature=0.0)
284
 
285
- def summarize(self, user_query: str, retrieved_docs: List[Document], question_text: Optional[str] = None, top_n_sources: int = 6) -> Dict:
286
- if not retrieved_docs:
287
- return {"answer": "No relevant crosstab data found for that query.", "sources": []}
288
- context_parts, sources = [], []
289
- for i, d in enumerate(retrieved_docs):
290
- md = d.metadata or {}
291
- id_hint = md.get("question_id") or md.get("variable_name") or f"part_{i+1}"
292
- content = d.page_content or ""
293
- context_parts.append(f"--- Part {i+1} | {id_hint} ---\n{content}")
294
- sources.append(id_hint)
295
- context_text = "\n\n".join(context_parts)
296
-
297
- # Load prompts from files
298
- system_prompt = _load_prompt_file("crosstab_rag_prompt_system.txt")
299
-
300
- question_context = f"\n\nSURVEY QUESTION THAT WAS RETRIEVED: {question_text}" if question_text else ""
301
- relevance_check = (
302
- "\n\n⚠️ FIRST: Check if the retrieved question above is actually relevant to the user's question. "
303
- "If it's about a different topic (e.g., user asked about 'economy' but question is about 'unity' or 'politics'), "
304
- "you MUST state this clearly and NOT provide detailed analysis of irrelevant data."
305
- ) if question_text else ""
306
-
307
- user_prompt_template = _load_prompt_file("crosstab_rag_prompt_user.txt")
308
- user_prompt = user_prompt_template.format(
309
- user_query=user_query,
310
- question_context=question_context,
311
- relevance_check=relevance_check,
312
- context_text=context_text
313
- )
314
- from langchain.schema import HumanMessage, SystemMessage
315
- messages = [SystemMessage(content=system_prompt), HumanMessage(content=user_prompt)]
316
- try:
317
- result = self.llm.invoke(messages)
318
- answer = result.content if hasattr(result, 'content') else str(result)
319
- except Exception as e:
320
- answer = f"Error generating summary: {e}"
321
- return {"answer": answer.strip(), "sources": sources[:top_n_sources]}
322
-
323
- # -------------------------
324
- # Orchestration - full pipeline
325
- # -------------------------
326
  class CrosstabsRAG:
327
- def __init__(self, questionnaire_rag: QuestionnaireRAG, verbose: bool = False):
328
- """
329
- Initialize CrosstabsRAG.
330
-
331
- Args:
332
- questionnaire_rag: Initialized QuestionnaireRAG instance to reuse for question matching
333
- verbose: Whether to print detailed logging
334
- """
335
  self.questionnaire_rag = questionnaire_rag
336
  self.verbose = verbose
337
- self.retriever = CrosstabRetriever(verbose=verbose)
338
- self.summarizer = CrosstabSummarizer()
 
 
 
 
 
 
 
 
 
 
 
339
 
340
- def query(self, user_query: str, filters: Optional[Dict[str, Any]] = None) -> Dict:
 
 
 
 
 
 
341
  """
342
- Query the crosstab system. Extracts poll, year, and month from the query.
343
- Uses QuestionnaireRAG to find matching questions, then retrieves crosstab data.
 
 
344
 
345
  Args:
346
- user_query: The question to answer
347
- filters: Optional filters dict (may include topic, year, month, survey_name)
 
 
348
 
349
  Returns:
350
- Dict with answer, sources, and metadata
351
  """
352
- # Extract year, month, poll from query
353
- hints = extract_year_month_poll(user_query)
354
- year, month, poll = hints.get("year"), hints.get("month"), hints.get("poll")
355
-
356
- # If missing required info, try to get from filters
357
- if not year and filters and "year" in filters:
358
- year = str(filters["year"])
359
- if not month and filters and "month" in filters:
360
- month = filters["month"]
361
- if not poll and filters and "survey_name" in filters:
362
- poll = "Vanderbilt_Unity_Poll" # Default mapping
363
-
364
- # If still missing required info, return error instead of prompting
365
- if not all([poll, year, month]):
366
- missing = []
367
- if not poll: missing.append("poll/survey name")
368
- if not year: missing.append("year")
369
- if not month: missing.append("month")
370
- return {"error": f"Could not determine {', '.join(missing)} from query. Please specify in your question."}
371
-
372
- # Build filters for QuestionnaireRAG
373
- q_filters = {
374
- "year": int(year),
375
- "month": month,
376
- "survey_name": "Vanderbilt Unity Poll" # Map from poll variable if needed
377
- }
378
-
379
- # Add topic filter if provided
380
- if filters:
381
- if self.verbose:
382
- print(f" πŸ“₯ Received filters: {filters}")
383
- if "topic" in filters and filters["topic"]:
384
- q_filters["topic"] = filters["topic"]
385
- if self.verbose:
386
- print(f" πŸ“Œ Added topic filter: {filters['topic']}")
387
- elif self.verbose and "topic" not in filters:
388
- print(f" ⚠️ No 'topic' key in filters dict")
389
- elif self.verbose:
390
- print(f" ⚠️ Topic filter is empty/None: {filters.get('topic')}")
391
- elif self.verbose:
392
- print(f" ⚠️ No filters dict provided to CrosstabsRAG.query()")
393
-
394
- # Enhance query text to emphasize topic if provided
395
- enhanced_query = user_query
396
- if filters and "topic" in filters:
397
- topic = filters["topic"]
398
- # Make sure topic is mentioned prominently in the query
399
- if topic.lower() not in enhanced_query.lower():
400
- enhanced_query = f"{topic} {enhanced_query}"
401
-
402
- # Use QuestionnaireRAG to find matching questions
403
  if self.verbose:
404
- print(f"πŸ” [CrosstabRAG] Step 1: Querying QuestionnaireRAG vectorstore")
405
- print(f" Query: {enhanced_query}")
406
- print(f" Filters being passed: {q_filters}")
407
-
408
- try:
409
- q_result = self.questionnaire_rag.query_with_metadata(
410
- question=enhanced_query,
411
- filters=q_filters,
412
- k=10 # Get more matches to capture all economy questions
413
- )
414
- except Exception as e:
415
- return {"error": f"Error querying questionnaire: {e}"}
416
-
417
- source_questions = q_result.get("source_questions", [])
418
- if not source_questions:
419
- return {"error": "No matching questions found in questionnaire for that query."}
420
-
421
- if self.verbose:
422
- print(f"βœ… [CrosstabRAG] Step 1 Complete: QuestionnaireRAG matched {len(source_questions)} question(s)")
423
- for i, q in enumerate(source_questions[:3], 1):
424
- var = q.get("variable_name", "unknown")
425
- qtext = q.get("question_text", "")[:80]
426
- print(f" {i}. {var}: {qtext}...")
427
-
428
- # Build namespace for crosstab retrieval
429
- namespace = f"{poll}_{year}_{month}_cleaned_data_crosstabs".replace(" ", "_")
430
-
431
- # Process ALL matched questions (not just the first one)
432
- all_question_answers = []
433
- all_sources = []
434
- matched_variables = []
435
-
436
- for matched_question in source_questions:
437
- variable_name = matched_question["variable_name"]
438
- question_text = matched_question["question_text"]
439
-
440
  if self.verbose:
441
- print(f"\nπŸ” [CrosstabRAG] Step 2: Processing {variable_name}")
442
- print(f" Namespace: {namespace}")
443
- print(f" Variable: {variable_name}")
444
 
445
- # Retrieve crosstab chunks for this specific variable
446
- crosstab_docs = self.retriever.retrieve_parts_for_variable(
447
- namespace=namespace,
448
- variable_prefix=variable_name,
449
- user_query=user_query,
450
- k=PINECONE_RETRIEVE_K
451
  )
452
 
453
- if not crosstab_docs:
454
- if self.verbose:
455
- print(f" ⚠️ No crosstab data found for {variable_name}")
456
- continue
457
 
458
- if self.verbose:
459
- print(f" βœ… Retrieved {len(crosstab_docs)} crosstab chunk(s)")
460
- chunk_ids = [d.metadata.get("question_id", d.metadata.get("variable_name", "unknown")) for d in crosstab_docs[:3]]
461
- print(f" Chunk IDs: {', '.join(chunk_ids)}{' ...' if len(crosstab_docs) > 3 else ''}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
462
 
463
- # Summarize this question's crosstab data
464
- summary = self.summarizer.summarize(
465
- user_query=user_query,
466
- retrieved_docs=crosstab_docs,
467
- question_text=question_text,
468
- top_n_sources=6
469
- )
470
 
471
- # Add question identifier to the answer
472
- question_header = f"\n\n--- Question: {variable_name} ---\n{question_text}\n"
473
- question_answer = question_header + summary["answer"].strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
474
 
475
- all_question_answers.append(question_answer)
476
- all_sources.extend(summary["sources"])
477
- matched_variables.append(variable_name)
478
-
479
- if not all_question_answers:
480
- return {"error": f"No crosstab data found for any of the {len(source_questions)} matched questions in namespace '{namespace}'."}
481
-
482
- if self.verbose:
483
- print(f"\nπŸ” [CrosstabRAG] Step 3: Combining {len(all_question_answers)} question(s)")
484
-
485
- # Combine all question answers into a single comprehensive answer
486
- combined_answer = "\n\n".join(all_question_answers)
487
-
488
- # Add overall citation block
489
- citation_block = (
490
- f"\n\n---\nSource: {poll.replace('_', ' ')}, {month} {year}\n"
491
- f"Questions analyzed: {', '.join(matched_variables)}\n"
492
- f"Total questions: {len(matched_variables)}\n"
493
- )
494
- combined_answer = combined_answer + citation_block
495
-
496
- return {
497
- "answer": combined_answer,
498
- "sources": list(set(all_sources)), # Deduplicate sources
499
- "matched_variable": matched_variables[0] if len(matched_variables) == 1 else f"{len(matched_variables)} questions",
500
- "matched_variables": matched_variables, # Add all matched variables
501
- "matched_question": source_questions[0]["question_text"] if source_questions else "",
502
- "namespace_used": namespace,
503
- "survey_info": {"poll": poll, "year": year, "month": month}
504
- }
505
-
506
- def retrieve_raw_data(self, user_query: str, filters: Optional[Dict[str, Any]] = None) -> Dict:
507
- """
508
- Retrieve raw data without LLM summarization.
509
- Used by agent framework to get raw data for synthesis.
510
-
511
- Args:
512
- user_query: The question to answer
513
- filters: Optional filters dict (may include topic, year, month, survey_name)
514
-
515
- Returns:
516
- Dict with crosstab_docs_by_variable, matched_questions, namespace_used, survey_info
517
- """
518
- # Extract year, month, poll from query
519
- hints = extract_year_month_poll(user_query)
520
- year, month, poll = hints.get("year"), hints.get("month"), hints.get("poll")
521
-
522
- # If missing required info, try to get from filters
523
- if not year and filters and "year" in filters:
524
- year = str(filters["year"])
525
- if not month and filters and "month" in filters:
526
- month = filters["month"]
527
- if not poll and filters and "survey_name" in filters:
528
- poll = "Vanderbilt_Unity_Poll" # Default mapping
529
-
530
- # If still missing required info, return error instead of prompting
531
- if not all([poll, year, month]):
532
- missing = []
533
- if not poll: missing.append("poll/survey name")
534
- if not year: missing.append("year")
535
- if not month: missing.append("month")
536
- return {"error": f"Could not determine {', '.join(missing)} from query. Please specify in your question."}
537
-
538
- # Build filters for QuestionnaireRAG
539
- q_filters = {
540
- "year": int(year),
541
- "month": month,
542
- "survey_name": "Vanderbilt Unity Poll" # Map from poll variable if needed
543
- }
544
-
545
- # Add topic filter if provided
546
- if filters:
547
- if self.verbose:
548
- print(f" πŸ“₯ Received filters: {filters}")
549
- if "topic" in filters and filters["topic"]:
550
- q_filters["topic"] = filters["topic"]
551
- if self.verbose:
552
- print(f" πŸ“Œ Added topic filter: {filters['topic']}")
553
-
554
- # Enhance query text to emphasize topic if provided
555
- enhanced_query = user_query
556
- if filters and "topic" in filters:
557
- topic = filters["topic"]
558
- # Make sure topic is mentioned prominently in the query
559
- if topic.lower() not in enhanced_query.lower():
560
- enhanced_query = f"{topic} {enhanced_query}"
561
 
562
- # Use QuestionnaireRAG to find matching questions
563
  if self.verbose:
564
- print(f"πŸ” [CrosstabRAG] Step 1: Querying QuestionnaireRAG vectorstore (raw data)")
565
- print(f" Query: {enhanced_query}")
566
- print(f" Filters being passed: {q_filters}")
567
 
568
  try:
569
  q_result = self.questionnaire_rag.retrieve_raw_data(
570
- question=enhanced_query,
571
- filters=q_filters,
572
- k=10 # Get more matches to capture all questions
573
  )
574
  except Exception as e:
575
  return {"error": f"Error querying questionnaire: {e}"}
576
 
577
  source_questions = q_result.get("source_questions", [])
 
 
578
  if not source_questions:
579
  return {"error": "No matching questions found in questionnaire for that query."}
580
 
581
  if self.verbose:
582
- print(f"βœ… [CrosstabRAG] Step 1 Complete: QuestionnaireRAG matched {len(source_questions)} question(s)")
583
- for i, q in enumerate(source_questions[:3], 1):
584
- var = q.get("variable_name", "unknown")
585
- qtext = q.get("question_text", "")[:80]
586
- print(f" {i}. {var}: {qtext}...")
587
 
588
- # Build namespace for crosstab retrieval
589
- namespace = f"{poll}_{year}_{month}_cleaned_data_crosstabs".replace(" ", "_")
 
 
 
590
 
591
- # Process ALL matched questions and collect raw crosstab documents
592
- crosstab_docs_by_variable = {}
 
 
 
593
  matched_variables = []
 
594
 
595
  for matched_question in source_questions:
596
  variable_name = matched_question["variable_name"]
597
  question_text = matched_question["question_text"]
598
 
599
- if self.verbose:
600
- print(f"\nπŸ” [CrosstabRAG] Step 2: Processing {variable_name} (raw data)")
601
- print(f" Namespace: {namespace}")
602
- print(f" Variable: {variable_name}")
603
-
604
- # Retrieve crosstab chunks for this specific variable
605
- crosstab_docs = self.retriever.retrieve_parts_for_variable(
606
- namespace=namespace,
607
- variable_prefix=variable_name,
608
- user_query=user_query,
609
- k=PINECONE_RETRIEVE_K
610
- )
611
-
612
- if not crosstab_docs:
613
- if self.verbose:
614
- print(f" ⚠️ No crosstab data found for {variable_name}")
615
- continue
616
-
617
- if self.verbose:
618
- print(f" βœ… Retrieved {len(crosstab_docs)} crosstab chunk(s)")
619
-
620
- # Store raw documents without summarization
621
- crosstab_docs_by_variable[variable_name] = {
622
- "crosstab_docs": crosstab_docs,
623
- "question_text": question_text,
624
- "matched_question": matched_question
625
- }
626
- matched_variables.append(variable_name)
627
-
628
- if not crosstab_docs_by_variable:
629
- return {"error": f"No crosstab data found for any of the {len(source_questions)} matched questions in namespace '{namespace}'."}
630
-
631
- if self.verbose:
632
- print(f"\nβœ… [CrosstabRAG] Step 2 Complete: Retrieved raw data for {len(matched_variables)} question(s)")
633
 
634
  return {
635
- "crosstab_docs_by_variable": crosstab_docs_by_variable,
636
  "matched_questions": source_questions,
637
  "matched_variables": matched_variables,
638
- "namespace_used": namespace,
639
- "survey_info": {"poll": poll, "year": year, "month": month}
640
  }
641
 
642
- # -------------------------
643
- # CLI / Interactive
644
- # -------------------------
645
- def main():
646
- parser = argparse.ArgumentParser(description="Crosstab RAG CLI - query survey crosstabs.")
647
- parser.add_argument("--query", "-q", help="Question to ask (if omitted, interactive).", default=None)
648
- args = parser.parse_args()
649
-
650
- # Initialize QuestionnaireRAG first (needed for CrosstabsRAG)
651
- openai_api_key = os.getenv("OPENAI_API_KEY")
652
- pinecone_api_key = os.getenv("PINECONE_API_KEY")
653
-
654
- if not openai_api_key or not pinecone_api_key:
655
- print("Error: Missing API keys")
656
- print("Set OPENAI_API_KEY and PINECONE_API_KEY environment variables")
657
- return
658
-
659
- questionnaire_rag = QuestionnaireRAG(
660
- openai_api_key=openai_api_key,
661
- pinecone_api_key=pinecone_api_key,
662
- persist_directory="./questionnaire_vectorstores",
663
- verbose=False
664
- )
665
-
666
- system = CrosstabsRAG(questionnaire_rag=questionnaire_rag)
667
-
668
- if args.query:
669
- out = system.query(args.query)
670
- if "error" in out:
671
- print(f"Error: {out['error']}")
672
- else:
673
- matched_question = out.get("matched_question", "")
674
- if matched_question:
675
- print(f"\nSURVEY QUESTION:\n{matched_question}\n")
676
- print("ANSWER:\n", out["answer"])
677
- else:
678
- print("Interactive Crosstab RAG\nType 'quit' to stop.")
679
- while True:
680
- try:
681
- q = input("\nYour question: ").strip()
682
- if not q or q.lower() in ("quit","exit"):
683
- break
684
- out = system.query(q)
685
- if "error" in out:
686
- print(f"Error: {out['error']}")
687
- continue
688
- matched_question = out.get("matched_question", "")
689
- if matched_question:
690
- print(f"\nSURVEY QUESTION:\n{matched_question}\n")
691
- print("ANSWER:\n", out["answer"])
692
- except KeyboardInterrupt:
693
- break
694
-
695
- if __name__ == "__main__":
696
- main()
 
 
1
  """
2
+ Crosstab RAG Module
3
+ ------------------
4
+ Retrieves crosstab demographic breakdown data from Pinecone vectorstore.
5
+ Uses question_info for precise namespace matching and metadata filtering.
6
+ Returns raw data only - no synthesis.
 
 
 
 
7
  """
8
 
9
  import os
 
 
10
  from typing import List, Dict, Optional, Any
11
  from pathlib import Path
12
 
13
  from dotenv import load_dotenv
14
+ from langchain_openai import OpenAIEmbeddings
 
15
  from langchain.schema import Document
16
  from langchain_pinecone import PineconeVectorStore
17
  from pinecone import Pinecone
18
 
 
 
 
19
  load_dotenv()
20
 
21
+ # Import QuestionnaireRAG to reuse question matching when needed
22
+ try:
23
+ from questionnaire_rag import QuestionnaireRAG
24
+ except ImportError:
25
+ # Handle case where running as module
26
+ from .questionnaire_rag import QuestionnaireRAG
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
 
28
  PINECONE_RETRIEVE_K = 100
29
  MAX_CROSSTAB_CHUNKS = 50
30
 
31
+
32
+ class CrosstabSummarizer:
33
+ """Summarizes crosstab data to reduce token usage."""
34
+
35
+ def __init__(self, llm_model: str = None, openai_api_key: str = None):
36
+ from langchain_openai import ChatOpenAI
37
+ llm_model = llm_model or os.getenv("OPENAI_MODEL", "gpt-4o")
38
+ openai_api_key = openai_api_key or os.getenv("OPENAI_API_KEY")
39
+ self.llm = ChatOpenAI(model=llm_model, openai_api_key=openai_api_key, temperature=0.0)
40
+
41
+ def summarize(
42
+ self,
43
+ user_query: str,
44
+ retrieved_docs: List[Document],
45
+ question_text: Optional[str] = None,
46
+ top_n_sources: int = 6
47
+ ) -> Dict:
48
+ """Summarize crosstab data, extracting relevant demographic breakdowns."""
49
+ if not retrieved_docs:
50
+ return {"answer": "No relevant crosstab data found for that query.", "sources": []}
51
+
52
+ context_parts, sources = [], []
53
+ for i, d in enumerate(retrieved_docs):
54
+ # Handle both Document objects and dicts (from checkpoint deserialization)
55
+ if hasattr(d, 'metadata'):
56
+ md = d.metadata or {}
57
+ content = d.page_content or ""
58
+ elif isinstance(d, dict):
59
+ md = d.get("metadata", {})
60
+ content = d.get("page_content", "")
61
+ else:
62
+ md = {}
63
+ content = ""
64
+
65
+ id_hint = md.get("question_id") or md.get("variable_name") or f"part_{i+1}"
66
+ context_parts.append(f"--- Part {i+1} | {id_hint} ---\n{content}")
67
+ sources.append(id_hint)
68
+ context_text = "\n\n".join(context_parts)
69
+
70
+ # Load prompts
71
+ prompt_dir = Path(__file__).parent / "prompts"
72
+ system_prompt_path = prompt_dir / "crosstab_rag_prompt_system.txt"
73
+ user_prompt_path = prompt_dir / "crosstab_rag_prompt_user.txt"
74
+
75
+ system_prompt = system_prompt_path.read_text(encoding="utf-8") if system_prompt_path.exists() else ""
76
+
77
+ question_context = f"\n\nSURVEY QUESTION THAT WAS RETRIEVED: {question_text}" if question_text else ""
78
+ relevance_check = (
79
+ "\n\n⚠️ RELEVANCE: The retrieved question IS relevant to the user's query. "
80
+ "Remember: ALL subtopics, specific examples, and related aspects ARE relevant:\n"
81
+ "- 'personal financial situation' IS about economy\n"
82
+ "- 'tariffs' IS about economy\n"
83
+ "- 'stock market' IS about economy\n"
84
+ "- 'gender-affirming healthcare' IS about healthcare\n"
85
+ "- 'Biden approval' IS about presidential approval\n"
86
+ "Only flag as irrelevant if about a COMPLETELY UNRELATED topic (e.g., user asked 'economy' but question is about 'sports teams'). "
87
+ "When in doubt, ANALYZE THE DATA - do not reject it."
88
+ ) if question_text else ""
89
+
90
+ user_prompt_template = user_prompt_path.read_text(encoding="utf-8") if user_prompt_path.exists() else "{user_query}\n\n{context_text}"
91
+ user_prompt = user_prompt_template.format(
92
+ user_query=user_query,
93
+ question_context=question_context,
94
+ relevance_check=relevance_check,
95
+ context_text=context_text
96
+ )
97
+
98
+ from langchain.schema import HumanMessage, SystemMessage
99
+ messages = [SystemMessage(content=system_prompt), HumanMessage(content=user_prompt)]
100
+ try:
101
+ result = self.llm.invoke(messages)
102
+ answer = result.content if hasattr(result, 'content') else str(result)
103
+ except Exception as e:
104
+ answer = f"Error generating summary: {e}"
105
+ return {"answer": answer.strip(), "sources": sources[:top_n_sources]}
106
 
107
 
 
 
 
108
  class CrosstabRetriever:
109
+ """Retrieves crosstab chunks from Pinecone using metadata filtering."""
110
+
111
+ def __init__(
112
+ self,
113
+ pinecone_api_key: str,
114
+ index_name: str,
115
+ embed_model: str,
116
+ openai_api_key: str,
117
+ verbose: bool = False
118
+ ):
119
  self.pc = Pinecone(api_key=pinecone_api_key)
120
  self.index_name = index_name
121
  self.embedder = OpenAIEmbeddings(model=embed_model, openai_api_key=openai_api_key)
122
  self.verbose = verbose
123
 
124
+ def _build_namespace_from_question_info(self, question_info: Dict[str, Any]) -> Optional[str]:
125
+ """Build namespace from question_info (year + month)"""
126
+ year = question_info.get("year")
127
+ month = question_info.get("month", "")
128
+
129
+ if year and month:
130
+ return f"Vanderbilt_Unity_Poll_{year}_{month}_cleaned_data_crosstabs".replace(" ", "_")
131
+
132
+ # Try to extract from poll_date
133
+ poll_date = question_info.get("poll_date", "")
134
+ if poll_date:
135
+ try:
136
+ from datetime import datetime
137
+ # Handle format like "2025-June"
138
+ if "-" in poll_date and len(poll_date.split("-")) == 2:
139
+ year_str, month_str = poll_date.split("-")
140
+ return f"Vanderbilt_Unity_Poll_{year_str}_{month_str}_cleaned_data_crosstabs".replace(" ", "_")
141
+ else:
142
+ date_obj = datetime.strptime(poll_date, "%Y-%m-%d")
143
+ year_str = str(date_obj.year)
144
+ month_str = date_obj.strftime("%B")
145
+ return f"Vanderbilt_Unity_Poll_{year_str}_{month_str}_cleaned_data_crosstabs".replace(" ", "_")
146
+ except Exception as e:
147
+ if self.verbose:
148
+ print(f" ⚠️ Failed to parse poll_date '{poll_date}': {e}")
149
+
150
+ return None
151
 
152
+ def retrieve_parts_for_question_info(
153
+ self,
154
+ question_info_list: List[Dict[str, Any]],
155
+ k: int = PINECONE_RETRIEVE_K,
156
+ filters: Optional[Dict[str, Any]] = None
157
+ ) -> Dict[str, List[Document]]:
158
  """
159
+ Retrieve crosstab chunks for question_info list.
160
+ Groups by namespace (year/month) and filters by variable_name and question_id.
 
 
161
 
162
  Args:
163
+ question_info_list: List of question info dicts with variable_name, year, month, question_id
164
+ k: Number of results to retrieve per variable
165
+ filters: Optional filters with year/month to constrain namespace search
 
166
 
167
  Returns:
168
+ Dict mapping variable_name to list of Document objects
169
  """
170
  try:
171
  index = self.pc.Index(self.index_name)
172
  stats = index.describe_index_stats()
173
+ available_namespaces = list(stats.get('namespaces', {}).keys())
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
174
 
175
+ if not available_namespaces:
176
+ if self.verbose:
177
+ print(" ⚠️ No namespaces found in index")
178
+ return {}
179
+
180
+ # Build target namespace from filters if provided
181
+ target_namespace = None
182
+ if filters:
183
+ year = filters.get("year")
184
+ month = filters.get("month", "")
185
+ if year and month:
186
+ target_namespace = f"Vanderbilt_Unity_Poll_{year}_{month}_cleaned_data_crosstabs".replace(" ", "_")
187
+ if target_namespace not in available_namespaces:
188
+ if self.verbose:
189
+ print(f" ⚠️ Target namespace {target_namespace} not found in available namespaces")
190
+ target_namespace = None
191
+
192
+ # Group questions by namespace
193
+ questions_by_namespace = {}
194
+ for q_info in question_info_list:
195
+ var_name = q_info.get("variable_name")
196
+ if not var_name:
197
+ continue
198
+
199
+ # Try to build namespace from question_info first
200
+ namespace = self._build_namespace_from_question_info(q_info)
201
+ if namespace and namespace in available_namespaces:
202
+ if namespace not in questions_by_namespace:
203
+ questions_by_namespace[namespace] = []
204
+ questions_by_namespace[namespace].append(var_name)
205
+ elif target_namespace:
206
+ # Use target namespace from filters
207
+ if target_namespace not in questions_by_namespace:
208
+ questions_by_namespace[target_namespace] = []
209
+ questions_by_namespace[target_namespace].append(var_name)
210
+ else:
211
+ # Only search all namespaces if NO question metadata is available
212
+ # This prevents broad searches when question_info is provided
213
+ if self.verbose:
214
+ print(f" ⚠️ Could not determine namespace for {var_name} (year={q_info.get('year')}, month={q_info.get('month')})")
215
+ # Skip this question rather than searching all namespaces
216
+ continue
217
 
218
+             # Get embedding dimension
              embed_dim = 1536  # Default for text-embedding-3-small
              try:
                  if hasattr(self.embedder, 'model') and 'small' in str(self.embedder.model).lower():
              except:
                  pass
 
              dummy_vector = [0.0] * embed_dim
+             all_docs_by_variable = {}
 
+             # Build mapping from variable_name to question_id for filtering
+             var_to_question_id = {}
+             for q_info in question_info_list:
+                 var_name = q_info.get("variable_name")
+                 question_id = q_info.get("question_id")
+                 if var_name and question_id:
+                     var_to_question_id[var_name] = question_id
 
+             # Search each namespace
+             for namespace, var_names in questions_by_namespace.items():
+                 if namespace not in available_namespaces:
+                     continue
 
                  if self.verbose:
+                     print(f"   πŸ” Searching namespace: {namespace}")
+                     print(f"      Looking for variables: {', '.join(sorted(set(var_names)))}")
+                     if var_to_question_id:
+                         matched_vars = [v for v in var_names if v in var_to_question_id]
+                         if matched_vars:
+                             print(f"      πŸ”‘ Using question_id filter for: {', '.join(sorted(set(matched_vars)))}")
 
+                 # Build filter for variable names and question IDs
+                 unique_vars = list(set(var_names))
 
+                 # Build filter conditions - match on either variable_name OR question_id
+                 filter_conditions = []
+                 for var in unique_vars:
+                     var_conditions = []
+
+                     # Add variable_name conditions (with and without _crosstab suffix)
+                     var_conditions.append({"variable_name": {"$eq": var}})
+                     var_conditions.append({"variable_name": {"$eq": f"{var}_crosstab"}})
+
+                     # Add question_id condition if available
+                     # Note: question_id in Pinecone metadata might have _part suffix for chunked crosstabs
+                     # but we match on base question_id and filter in post-processing
+                     if var in var_to_question_id:
+                         question_id = var_to_question_id[var]
+                         var_conditions.append({"question_id": {"$eq": question_id}})
+
+                     # Combine conditions for this variable with $or
+                     if len(var_conditions) > 1:
+                         filter_conditions.append({"$or": var_conditions})
+                     else:
+                         filter_conditions.append(var_conditions[0])
+
+                 # Combine all variable filters with $or
+                 if len(filter_conditions) == 1:
+                     var_filter = filter_conditions[0]
+                 else:
+                     var_filter = {"$or": filter_conditions}
 
+                 try:
+                     result = index.query(
+                         vector=dummy_vector,
+                         top_k=k * len(unique_vars),
+                         namespace=namespace,
+                         filter=var_filter,
+                         include_metadata=True
+                     )
+
                      if self.verbose:
+                         print(f"   πŸ“Š Found {len(result.matches)} matches in {namespace}")
+
+                     for match in result.matches:
+                         metadata = match.metadata or {}
+                         var_name = metadata.get("variable_name", "")
+
+                         # Handle question_id format like "VAND10_part1"
+                         question_id = metadata.get("question_id", "")
+                         if question_id and "_part" in question_id:
+                             base_var = question_id.split("_part")[0].replace("_crosstab", "")
+                             if base_var in unique_vars:
+                                 var_name = base_var
+
+                         # Check if variable_name has _crosstab suffix
+                         if var_name and var_name.endswith("_crosstab"):
+                             base_var = var_name.replace("_crosstab", "")
+                             if base_var in unique_vars:
+                                 var_name = base_var
+
+                         if not var_name or var_name not in unique_vars:
+                             continue
+
+                         content = metadata.pop('text', '') or metadata.pop('page_content', '') or ''
+                         if not content:
+                             continue
+
+                         if var_name not in all_docs_by_variable:
+                             all_docs_by_variable[var_name] = []
+
+                         all_docs_by_variable[var_name].append(
+                             Document(page_content=content, metadata=metadata)
+                         )
 
+                 except Exception as e:
+                     if self.verbose:
+                         print(f"   ⚠️ Error querying namespace {namespace}: {e}")
+                     continue
+
+             # Sort documents by chunk_index
+             for var_name in all_docs_by_variable:
+                 all_docs_by_variable[var_name].sort(key=lambda d: d.metadata.get("chunk_index", 999))
+                 all_docs_by_variable[var_name] = all_docs_by_variable[var_name][:MAX_CROSSTAB_CHUNKS]
 
              if self.verbose:
+                 total_docs = sum(len(docs) for docs in all_docs_by_variable.values())
+                 print(f"   βœ… Retrieved {total_docs} total document(s) for {len(all_docs_by_variable)} variable(s)")
 
+             return all_docs_by_variable
 
          except Exception as e:
              if self.verbose:
+                 print(f"   ❌ Error in retrieve_parts_for_question_info: {e}")
+             return {}
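
For reference, the namespace string and metadata filter that the method above ends up constructing look roughly like this (illustrative values only; `VAND10` stands in for any variable from the June 2025 poll):

```python
# Illustrative only - shapes produced by retrieve_parts_for_question_info.

# Namespace built from filters (year=2025, month="June"):
namespace = "Vanderbilt_Unity_Poll_2025_June_cleaned_data_crosstabs"

# Metadata filter built for variable "VAND10" when its question_id is known:
var_filter = {
    "$or": [
        {"variable_name": {"$eq": "VAND10"}},
        {"variable_name": {"$eq": "VAND10_crosstab"}},
        {"question_id": {"$eq": "VAND10"}},
    ]
}
```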
  class CrosstabsRAG:
+     """Crosstabs RAG with question_info-based retrieval."""
+
+     def __init__(
+         self,
+         questionnaire_rag: QuestionnaireRAG,
+         verbose: bool = False
+     ):
          self.questionnaire_rag = questionnaire_rag
          self.verbose = verbose
+
+         pinecone_api_key = os.getenv("PINECONE_API_KEY")
+         openai_api_key = os.getenv("OPENAI_API_KEY")
+         index_name = os.getenv("PINECONE_INDEX_NAME_CROSSTABS", "crosstab-index")
+         embed_model = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
+
+         self.retriever = CrosstabRetriever(
+             pinecone_api_key=pinecone_api_key,
+             index_name=index_name,
+             embed_model=embed_model,
+             openai_api_key=openai_api_key,
+             verbose=verbose
+         )
 
+     def retrieve_raw_data(
+         self,
+         user_query: str,
+         question_info: Optional[List[Dict[str, Any]]] = None,
+         source_questions: Optional[List[Dict[str, Any]]] = None,
+         filters: Optional[Dict[str, Any]] = None
+     ) -> Dict:
          """
+         Retrieve raw crosstab data.
+         Uses question_info if provided (skips QuestionnaireRAG).
+         Otherwise uses QuestionnaireRAG to find questions, then retrieves crosstabs.
+         Falls back to semantic search if metadata filtering returns no results.
 
          Args:
+             user_query: User's query (used for QuestionnaireRAG if question_info not provided)
+             question_info: List of question info dicts (preferred - skips QuestionnaireRAG)
+             source_questions: Optional list of full question dicts from previous stage (avoids lookup)
+             filters: Optional filters for QuestionnaireRAG
 
          Returns:
+             Dict with crosstab_docs_by_variable, matched_questions, namespace_used, survey_info
          """
          if self.verbose:
+             print(f"\nπŸ“Š [Crosstabs] Query: {user_query}")
+             if question_info:
+                 print(f"πŸ” Question info: {len(question_info)} question(s) provided")
+             if filters:
+                 print(f"πŸ” Filters: {filters}")
+
+         # If question_info provided, skip QuestionnaireRAG
+         if question_info:
              if self.verbose:
+                 print(f"βœ… Using provided question_info, skipping QuestionnaireRAG")
 
+             # Retrieve crosstab data directly
+             crosstab_docs_by_variable = self.retriever.retrieve_parts_for_question_info(
+                 question_info_list=question_info,
+                 k=PINECONE_RETRIEVE_K,
+                 filters=filters
              )
 
+             if not crosstab_docs_by_variable:
+                 return {"error": f"No crosstab data found for {len(question_info)} question(s)."}
 
+             # Get question metadata - use provided source_questions if available, otherwise lookup
+             if not source_questions:
+                 source_questions = []
+                 questions_by_id = self.questionnaire_rag.questions_by_id
+                 for q_info in question_info:
+                     question_id = q_info.get("question_id")
+                     if question_id and question_id in questions_by_id:
+                         source_questions.append(questions_by_id[question_id])
+                     else:
+                         # Fallback: try to find by variable_name and year/month
+                         var_name = q_info.get("variable_name")
+                         year = q_info.get("year")
+                         month = q_info.get("month", "")
+                         if var_name:
+                             # Search through questions_by_id for matching variable
+                             for qid, q_data in questions_by_id.items():
+                                 if (q_data.get("variable_name") == var_name and
+                                         q_data.get("year") == year and
+                                         q_data.get("month", "") == month):
+                                     source_questions.append(q_data)
+                                     break
 
+             # Format results
+             formatted_results = {}
+             matched_variables = []
+             all_namespaces = set()
 
+             for var_name, docs in crosstab_docs_by_variable.items():
+                 question_metadata = next(
+                     (q for q in source_questions if q.get("variable_name") == var_name),
+                     {}
+                 )
+                 question_text = question_metadata.get("question_text", "")
+
+                 if docs:
+                     first_doc_meta = docs[0].metadata
+                     survey_name = first_doc_meta.get("survey_name", "")
+                     all_namespaces.add(survey_name)
+
+                 formatted_results[var_name] = {
+                     "crosstab_docs": docs,
+                     "question_text": question_text or (docs[0].metadata.get("question_text", "") if docs else ""),
+                     "matched_question": question_metadata
+                 }
+                 matched_variables.append(var_name)
 
+             return {
+                 "crosstab_docs_by_variable": formatted_results,
+                 "matched_questions": source_questions,
+                 "matched_variables": matched_variables,
+                 "namespace_used": list(all_namespaces),
+                 "survey_info": {"poll": "Vanderbilt_Unity_Poll", "year": None, "month": None}
+             }
 
+         # Otherwise, use QuestionnaireRAG to find questions first
          if self.verbose:
+             print(f"πŸ” Using QuestionnaireRAG to find questions")
 
          try:
              q_result = self.questionnaire_rag.retrieve_raw_data(
+                 question=user_query,
+                 filters=filters or {},
+                 k=10
              )
          except Exception as e:
              return {"error": f"Error querying questionnaire: {e}"}
 
          source_questions = q_result.get("source_questions", [])
+         question_info_from_questions = q_result.get("question_info", [])
+
          if not source_questions:
              return {"error": "No matching questions found in questionnaire for that query."}
 
          if self.verbose:
+             print(f"βœ… Found {len(source_questions)} question(s) from QuestionnaireRAG")
 
+         # Retrieve crosstab data using question_info
+         crosstab_docs_by_variable = self.retriever.retrieve_parts_for_question_info(
+             question_info_list=question_info_from_questions,
+             k=PINECONE_RETRIEVE_K
+         )
 
+         if not crosstab_docs_by_variable:
+             return {"error": f"No crosstab data found for any of the {len(source_questions)} matched questions."}
+
+         # Format results
+         formatted_results = {}
          matched_variables = []
+         all_namespaces = set()
 
          for matched_question in source_questions:
              variable_name = matched_question["variable_name"]
              question_text = matched_question["question_text"]
 
+             if variable_name in crosstab_docs_by_variable:
+                 formatted_results[variable_name] = {
+                     "crosstab_docs": crosstab_docs_by_variable[variable_name],
+                     "question_text": question_text,
+                     "matched_question": matched_question
+                 }
+                 matched_variables.append(variable_name)
+
+                 if crosstab_docs_by_variable[variable_name]:
+                     first_doc = crosstab_docs_by_variable[variable_name][0]
+                     survey_name = first_doc.metadata.get("survey_name", "")
+                     all_namespaces.add(survey_name)
 
          return {
+             "crosstab_docs_by_variable": formatted_results,
              "matched_questions": source_questions,
              "matched_variables": matched_variables,
+             "namespace_used": list(all_namespaces),
+             "survey_info": {"poll": "Vanderbilt_Unity_Poll", "year": None, "month": None}
          }
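
A minimal usage sketch of the two retrieval paths above. The construction below is an assumption for illustration (it mirrors the test CLI in questionnaire_rag.py); it is not part of this commit:

```python
import os

# Assumes OPENAI_API_KEY / PINECONE_API_KEY (and index env vars) are set.
q_rag = QuestionnaireRAG(
    openai_api_key=os.getenv("OPENAI_API_KEY"),
    pinecone_api_key=os.getenv("PINECONE_API_KEY"),
)
rag = CrosstabsRAG(questionnaire_rag=q_rag, verbose=True)

# Path 1: question_info provided -> skips QuestionnaireRAG entirely.
result = rag.retrieve_raw_data(
    user_query="how do immigration responses vary by gender?",
    question_info=[{"variable_name": "VAND10", "question_id": "VAND10",
                    "year": 2025, "month": "June"}],
)

# Path 2: no question_info -> QuestionnaireRAG identifies questions first.
result = rag.retrieve_raw_data(
    user_query="how do immigration responses vary by gender?",
    filters={"year": 2025, "month": "June"},
)
print(result.get("matched_variables"))
```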
prompts/crosstab_rag_prompt_system.txt CHANGED
@@ -1,9 +1,15 @@
  You are a data analyst assistant specialized in interpreting survey crosstab tables.
 
- ## CRITICAL: Relevance Check
- Before answering, check if the retrieved question actually matches the user's query.
- - If the question is about a DIFFERENT topic, explicitly state this
- - Do NOT provide detailed analysis of irrelevant data
- - Only provide detailed analysis if the question is relevant to the user's query
+ ## CRITICAL: Assume Relevance Unless Obviously Wrong
+ The retrieved questions have already been filtered by topic, so assume they ARE relevant.
+ - Subtopics and specific aspects ARE relevant (e.g., "personal finances" IS economy, "tariffs" IS economy, "stock market" IS economy)
+ - ONLY reject data if it's about a COMPLETELY unrelated topic (e.g., user asked about "economy" but data is about "favorite sports team")
+ - When in doubt, PROVIDE THE ANALYSIS - do not be overly cautious
+
+ ## Data Extraction Requirements
+ - Extract ACTUAL percentages and counts for each demographic group from the crosstab
+ - When sample sizes are shown in the data (e.g., "N=500" or counts in parentheses), include them
+ - Present data in structured format (tables when appropriate)
+ - DO NOT make up or estimate values - use only what's in the context
 
  Provide clear, specific answers based only on the context provided.
prompts/relevance_check_prompt.txt ADDED
@@ -0,0 +1,93 @@
+ You are analyzing conversation continuity in a multi-turn survey data analysis system.
+
+ Your task: Determine if the current question is related to previous conversation and what data can be reused.
+
+ ## CONVERSATION HISTORY
+ {conversation_summary}
+
+ ## PREVIOUSLY RETRIEVED DATA
+ {previous_data_summary}
+
+ ## CURRENT QUESTION
+ {current_question}
+
+ ## ANALYSIS REQUIRED
+
+ 1. **Is the current question related to the previous conversation?**
+    - YES if: Same topic, same questions, same time period (even if different demographic)
+    - YES if: Asking for trend/analysis of already-shown data
+    - NO if: Completely different topic
+    - NO if: Same topic but different time period (e.g., June 2025 β†’ February 2025)
+
+ 2. **Relation Type** (if related):
+    - `same_topic_different_demo`: Same topic/questions, asking for different demographic breakdown
+      * Example: Previous "immigration by party" β†’ Current "immigration by gender"
+    - `trend_analysis`: Asking for analysis/trends from already-retrieved data
+      * Example: Previous showed data from 3 polls β†’ Current "what's the trend?"
+    - `same_topic_different_time`: Same topic but different time period
+      * Example: Previous "immigration June 2025" β†’ Current "immigration February 2025"
+    - `new_topic`: Completely different topic
+      * Example: Previous "immigration" β†’ Current "economy"
+
+ 3. **Reusable Data**:
+    - `questions`: true if same questions can be reused (same topic, same time period)
+    - `toplines`: true if overall frequencies already retrieved and still relevant
+    - `crosstabs`: true if demographic breakdowns already retrieved and still relevant
+
+ 4. **Time Period Changed**:
+    - true if current question asks about different year/month than previous
+    - false if time period is same or not specified
+
+ ## OUTPUT FORMAT
+
+ Return a structured assessment with fields:
+ - is_related: boolean
+ - relation_type: string (one of the types above)
+ - reusable_data: {"questions": boolean, "toplines": boolean, "crosstabs": boolean}
+ - time_period_changed: boolean
+ - reasoning: string (1-2 sentence explanation)
+
+ ## EXAMPLES
+
+ Example 1:
+ Previous: "How do immigration responses vary by political party in June 2025?"
+ Current: "Let's look at the breakdown by gender as well"
+ β†’ is_related: true, relation_type: "same_topic_different_demo", reusable_data: {questions: true, toplines: false, crosstabs: false}, time_period_changed: false
+ Reasoning: Same topic (immigration) and time period (June 2025), just requesting different demographic breakdown (gender instead of party).
+
+ Example 2:
+ Previous: "What is Joe Biden's approval rating in June 2025?"
+ Current: "Let's examine how this breaks down by gender"
+ β†’ is_related: true, relation_type: "same_topic_different_demo", reusable_data: {questions: true, toplines: true, crosstabs: false}, time_period_changed: false
+ Reasoning: Same question (Biden approval) and time period, asking for demographic breakdown of already-retrieved topline data.
+
+ Example 3:
+ Previous: "Immigration questions in June 2025"
+ Current: "What about February 2025?"
+ β†’ is_related: false, relation_type: "same_topic_different_time", reusable_data: {questions: false, toplines: false, crosstabs: false}, time_period_changed: true
+ Reasoning: Same topic but different time period - questions from June 2025 cannot be assumed to exist in February 2025.
+
+ Example 4:
+ Previous: Showed Biden approval by party for 3 different polls (June 2024, Sept 2024, June 2025)
+ Current: "What's the trend over time?"
+ β†’ is_related: true, relation_type: "trend_analysis", reusable_data: {questions: true, toplines: true, crosstabs: true}, time_period_changed: false
+ Reasoning: User wants analysis/trends from already-retrieved and displayed data, no new data retrieval needed.
+
+ Example 5:
+ Previous: "How do immigration responses vary by political party?"
+ Current: "Now show me the breakdown by gender"
+ β†’ is_related: true, relation_type: "same_topic_different_demo", reusable_data: {questions: true, toplines: false, crosstabs: false}, time_period_changed: false
+ Reasoning: Same immigration questions, same time period (unspecified in both), just requesting different demographic breakdown.
+
+ Example 6:
+ Previous: "What questions about the economy were asked in 2025?"
+ Current: "Tell me about immigration policies"
+ β†’ is_related: false, relation_type: "new_topic", reusable_data: {questions: false, toplines: false, crosstabs: false}, time_period_changed: false
+ Reasoning: Completely different topic - economy vs immigration, no data can be reused.
+
+ Example 7:
+ Previous: "Biden approval rating June 2025"
+ Current: "How does this break down by age?"
+ β†’ is_related: true, relation_type: "same_topic_different_demo", reusable_data: {questions: true, toplines: true, crosstabs: false}, time_period_changed: false
+ Reasoning: Same question and time period, asking for age demographic breakdown of already-retrieved approval data.
+
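One way the structured assessment described above could be modeled on the agent side - a sketch only, assuming Pydantic is used for structured output (the actual schema lives in the agent code, not in this prompt file):

```python
from typing import Literal
from pydantic import BaseModel


class ReusableData(BaseModel):
    questions: bool
    toplines: bool
    crosstabs: bool


class RelevanceAssessment(BaseModel):
    """Mirrors the OUTPUT FORMAT fields in relevance_check_prompt.txt."""
    is_related: bool
    relation_type: Literal[
        "same_topic_different_demo",
        "trend_analysis",
        "same_topic_different_time",
        "new_topic",
    ]
    reusable_data: ReusableData
    time_period_changed: bool
    reasoning: str
```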
prompts/research_brief_prompt.txt CHANGED
@@ -10,6 +10,67 @@ Available data sources:
 
  {available_months}
 
+ ## VALID TOPICS FOR METADATA FILTERING
+
+ **CRITICAL: When extracting topics from user queries, you MUST use ONLY these standardized topics:**
+
+ - `biden_administration` - Biden, his administration, policies
+ - `confidence_institutions` - Trust/confidence in institutions
+ - `economy` - Economy, finances, tariffs, inflation, stock market
+ - `education` - Education, colleges, universities, schools
+ - `elections` - Voting, elections, candidates, electoral process
+ - `foreign_policy` - International relations, China, foreign affairs
+ - `general` - General topics, unity, division, democracy, other
+ - `healthcare` - Health, medical, wellness
+ - `immigration` - Immigration, deportation, border, visas, undocumented
+ - `judicial` - Courts, judges, rulings, legal system
+ - `technology` - AI, artificial intelligence, innovation, tech
+ - `trump_administration` - Trump, MAGA, his administration, policies
+
+ **Topic Extraction Guidelines:**
+ - If user asks about "deporting undocumented immigrants" β†’ use topic=`immigration`
+ - If user asks about "tariffs" or "stock market" β†’ use topic=`economy`
+ - If user asks about "colleges" β†’ use topic=`education`
+ - If user asks about "Trump policies" β†’ use topic=`trump_administration`
+ - If user asks about "Biden approval" β†’ use topic=`biden_administration`
+ - If topic doesn't clearly map to above list β†’ use topic=`general` OR rely on semantic search (no topic filter)
+ - **NEVER invent new topics** - only use the 12 topics listed above
+
+ ## EFFICIENCY RULES (CRITICAL - REDUCE API CALLS)
+
+ **Topic-only CROSSTABS queries** (e.g., "how do immigration responses vary by X?"):
+ - NEVER create one stage per poll - this causes 9+ unnecessary QuestionnaireRAG queries
+ - ALWAYS use 2-stage approach:
+   1. Stage 1: QUESTIONNAIRE with topic filter (NO year/month) β†’ finds ALL questions across all polls in ONE query
+   2. Stage 2: CROSSTABS with question_ids from Stage 1 β†’ searches all namespaces efficiently
+ - This reduces API calls from 9+ to just 2 stages total
+
+ **Topic-based TOPLINES queries** (CRITICAL - MUST IDENTIFY QUESTIONS FIRST):
+ - NEVER use route_to_sources for topic-based toplines queries (e.g., "Joe Biden approval", "Trump approval")
+ - ALWAYS use 2-stage approach:
+   1. Stage 1: QUESTIONNAIRE with topic/person filter + year/month β†’ identifies relevant question(s)
+   2. Stage 2: TOPLINES with question_info from Stage 1 β†’ retrieves response data
+ - This ensures correct question identification before data retrieval
+ - Only use route_to_sources with TOPLINES if:
+   * User explicitly mentions a variable name/question ID (e.g., "VAND5", "VAND15")
+   * Questions were already retrieved in previous conversation turns
+
+ **When question IDs are available**:
+ - If previous stage found questions (questionnaire/toplines), ALWAYS use question_ids filter
+ - This skips QuestionnaireRAG entirely in crosstabs queries (saves API calls)
+
+ ## WHEN TO ASK FOLLOWUP vs BROAD SEARCH
+
+ **ASK FOLLOWUP for:**
+ - QUESTIONNAIRE queries without time period: "what questions about X were asked?" β†’ Ask for time period
+ - TOPLINES queries without time period: "what was approval?" β†’ Ask for time period
+ - Queries that are ambiguous or missing critical information
+
+ **DO NOT ASK FOLLOWUP for:**
+ - CROSSTABS queries without time period: "how do responses about X vary by Y?" β†’ Do broad search across all polls
+   * These queries benefit from cross-poll analysis
+   * Use 2-stage approach: Stage 1 finds all questions, Stage 2 gets crosstabs
+
  ## ACTIONS
 
  **1. followup** - Ask clarifying question if ambiguous OR unavailable data requested
@@ -20,13 +81,24 @@ Available data sources:
  - Most queries: single time period, specific question requests
  - Pipeline selection:
    * QUESTIONNAIRE: "what questions", "list questions", "show questions"
- * TOPLINES: "approval", "ratings", "percentages", "how many", "what %", "response frequencies"
+ * TOPLINES: ONLY use route_to_sources with TOPLINES if:
+   - User explicitly mentions a variable name/question ID (e.g., "VAND5", "VAND15")
+   - Questions were already retrieved in previous conversation turns (system will extract question_info automatically)
+   - DO NOT use route_to_sources for topic-based toplines queries (e.g., "Joe Biden approval", "Trump approval")
+   - For topic-based toplines queries, use execute_stages with Stage 1 querying QUESTIONNAIRE first
    * CROSSTABS: "vary by", "breakdown by", "by gender/age/race/etc", "differences by"
  - Retrieve ONLY the mentioned time period (no comparison unless explicit)
 
  **4. execute_stages** - Multi-stage for complex queries
  - Explicit comparisons: "compare X vs Y", "what changed"
  - Queries needing analysis across multiple retrievals
+ - Topic-only crosstab queries (see EFFICIENCY RULES above)
+ - **CRITICAL: Topic-based TOPLINES queries** (e.g., "Joe Biden approval", "Trump approval", "immigration responses"):
+   * ALWAYS use 2-stage approach:
+     1. Stage 1: QUESTIONNAIRE with topic/person filter + year/month β†’ identifies relevant question(s)
+     2. Stage 2: TOPLINES with question_info from Stage 1 β†’ retrieves response data
+   * This ensures correct question identification before data retrieval
+   * DO NOT use route_to_sources for topic-based toplines queries
  - Do NOT use for simple follow-ups about different time periods
 
  ## CONVERSATION CONTEXT RULES
@@ -37,6 +109,61 @@ Available data sources:
  - Create stages per month/question as appropriate
  - Do NOT ask followup if context can be inferred
 
+ **Relevance Analysis** (CRITICAL for efficiency):
+ - If RELEVANCE ANALYSIS section is provided in the conversation context above:
+   * ALWAYS check the relation_type to determine the correct strategy
+   * If relation_type = "same_topic_different_demo":
+     - Use route_to_sources with TOPLINES or CROSSTABS (single-stage)
+     - Questions are already identified and available from previous turn
+     - System will automatically extract question_info
+     - DO NOT create execute_stages with QUESTIONNAIRE query
+     - Example: Previous "immigration by party" β†’ Current "immigration by gender"
+       β†’ Use route_to_sources with CROSSTABS (NOT execute_stages)
+   * If relation_type = "trend_analysis":
+     - Use action='answer' to analyze already-retrieved data
+     - DO NOT retrieve any new data from any pipeline
+     - Synthesize answer from conversation history and previously shown results
+     - Example: Previous showed data from 3 polls β†’ Current "what's the trend?"
+       β†’ Use action='answer' (NOT execute_stages or route_to_sources)
+   * If relation_type = "same_topic_different_time":
+     - Treat as NEW QUERY even though topic is same
+     - Time period changed, so previous questions may not exist
+     - Must query QUESTIONNAIRE for new time period
+     - Use execute_stages with Stage 1 = QUESTIONNAIRE, Stage 2 = TOPLINES/CROSSTABS
+     - Example: Previous "June 2025" β†’ Current "February 2025"
+       β†’ Use execute_stages with QUESTIONNAIRE query for February 2025
+   * If relation_type = "new_topic":
+     - Treat as completely new query
+     - Follow standard routing logic below
+     - No data can be reused from previous conversation
+ - If NO RELEVANCE ANALYSIS section (first turn or relevance check unavailable):
+   * Follow standard routing logic below
+
+ **Previously Retrieved Questions** (CRITICAL for efficiency):
+ - System automatically detects when questions were retrieved in previous turns
+ - If RELEVANCE ANALYSIS shows relation_type = "same_topic_different_demo":
+   * Questions are already identified - DO NOT query QUESTIONNAIRE
+   * Use route_to_sources with TOPLINES or CROSSTABS (single-stage)
+   * System automatically extracts question_info from previous results
+   * Example: Previous "immigration by party" β†’ Current "immigration by gender"
+     β†’ Use route_to_sources with CROSSTABS (NOT execute_stages)
+ - If RELEVANCE ANALYSIS shows time_period_changed = true:
+   * Previous questions are NOT reusable
+   * Must re-query QUESTIONNAIRE for new time period
+ - If RELEVANCE ANALYSIS shows relation_type = "trend_analysis":
+   * All data already retrieved and displayed
+   * Use action='answer' to synthesize from history
+   * DO NOT create any data retrieval stages
+
+ **Question ID Tracking** (CRITICAL for efficiency):
+ - If previous query used TOPLINES pipeline, extract variable_name from toplines results
+ - If previous query used QUESTIONNAIRE pipeline, extract question_id or variable_name
+ - For follow-up queries like "how does this vary by gender":
+   * If question IDs are available from previous stage β†’ use CROSSTABS with question_ids filter
+   * This SKIPS QuestionnaireRAG entirely (more efficient)
+   * Example: Stage 1 (toplines) finds VAND15 β†’ Stage 2 (crosstabs) uses question_ids=["VAND15"]
+   * Set use_previous_results_for: "Extract question IDs from stage 1 for crosstab filtering"
+
  **Time Period Queries**:
  - "what about [X]?" = NEW question about X (not comparison)
  - Extract year+month β†’ single-stage (route_to_sources)
@@ -47,9 +174,21 @@ Available data sources:
  - Specific query ("approval in 2025?") β†’ followup if ambiguous
 
  **Broad Queries** (no time specification):
- - Assume analysis across ALL available polls (last 2+ years)
- - Use execute_stages with one stage per available poll
- - Do NOT ask followup - create stages automatically
+ - For CROSSTABS queries with topic only (e.g., "how do immigration responses vary by X?"):
+   * Stage 1: Query QUESTIONNAIRE with topic filter (NO year/month) to find ALL questions across all polls
+   * Stage 2: Query CROSSTABS with question_ids from Stage 1 (skips QuestionnaireRAG, searches all namespaces)
+   * Set use_previous_results_for: "Extract question IDs from stage 1 for crosstab filtering"
+   * This is MUCH more efficient than creating one stage per poll
+   * DO NOT ask followup - these queries benefit from cross-poll analysis
+ - For QUESTIONNAIRE queries without time period (e.g., "what questions about economy were asked?"):
+   * Ask followup: "Which time period are you interested in? (e.g., 2025, June 2025, or all polls)"
+   * These queries need time context to be useful
+ - For TOPLINES queries without time period:
+   * Ask followup: "Which time period are you interested in? (e.g., 2025, June 2025)"
+   * These queries need time context to retrieve specific response data
+ - For other broad queries:
+   * Assume analysis across ALL available polls (last 2+ years)
+   * Use execute_stages with one stage per available poll
 
  ## FILTERING
  - Map survey names: "Unity Poll" β†’ "Vanderbilt_Unity_Poll"
@@ -60,16 +199,46 @@ Available data sources:
  Simple queries (route_to_sources):
  - "what questions were asked in June 2025?" β†’ questionnaire, year=2025, month=June
  - "what about June 2025?" (after June 2022) β†’ questionnaire, year=2025, month=June (NOT staged)
- - "Trump's approval in June 2025?" β†’ toplines, year=2025, month=June
+ - "VAND5 responses in June 2025?" β†’ toplines, year=2025, month=June (variable explicitly mentioned)
  - "questions about economy in 2025?" β†’ questionnaire, year=2025, topic='economy'
 
+ Topic-based toplines queries (MUST use execute_stages):
+ - "Trump's approval in June 2025?" β†’ execute_stages:
+   * Stage 1: QUESTIONNAIRE with topic='trump_administration' or query="Trump approval", year=2025, month=June
+   * Stage 2: TOPLINES with question_info from Stage 1
+ - "Joe Biden's approval rating in June 2025?" β†’ execute_stages:
+   * Stage 1: QUESTIONNAIRE with topic='biden_administration' or query="Joe Biden approval", year=2025, month=June
+   * Stage 2: TOPLINES with question_info from Stage 1
+
+ Queries requiring followup:
+ - "what questions about the economy were asked?" (no time) β†’ followup: "Which time period are you interested in?"
+ - "what questions were asked?" (no topic, no time) β†’ followup: "Which topic and time period?"
+ - "Trump's approval?" (no time) β†’ followup: "Which time period are you interested in?"
+
  Multi-stage (execute_stages):
  - "compare June 2024 vs June 2025" β†’ stage 1: 2024, stage 2: 2025
  - "how do responses vary by gender in 2025?" (no month) β†’ stages for all 2025 months
- - "how do immigration responses vary by party?" (no time) β†’ stages for all available polls
+ - "how do immigration responses vary by party?" (no time, topic-only crosstab query):
+   * Stage 1: QUESTIONNAIRE with topic='immigration' (no year/month) β†’ finds ALL immigration questions
+   * Stage 2: CROSSTABS with question_ids from Stage 1 β†’ searches all namespaces efficiently
+   * Set use_previous_results_for: "Extract question IDs from stage 1"
+   * DO NOT create one stage per poll - this is inefficient!
+   * DO NOT ask followup - cross-poll analysis is valuable for crosstab queries
 
  Follow-up handling:
  - "how do responses vary by gender for each of these questions?" (referencing previous)
- β†’ Infer months from previous question's year, create stages per month
+   β†’ If questions were ALREADY retrieved in previous conversation turn:
+     * Use route_to_sources with CROSSTABS (single-stage)
+     * System automatically extracts question_info from previous results
+     * DO NOT create execute_stages with Stage 1 querying QuestionnaireRAG
+   β†’ If no previous results in conversation, infer months from previous question's year, create stages per month
  - "what was trump's approval in 2025?" β†’ followup: "Which month(s) in 2025?"
- - "June" (short answer) β†’ combine with previous intent, use toplines (approval = data)
+ - "June" (short answer) β†’ combine with previous intent, use execute_stages:
+   * Stage 1: QUESTIONNAIRE with topic='trump_administration' or query="Trump approval", year=2025, month=June
+   * Stage 2: TOPLINES with question_info from Stage 1
+ - "how does this vary by gender?" (after approval query)
+   β†’ If previous turn already retrieved questions:
+     * Use route_to_sources with CROSSTABS (single-stage, question_info extracted automatically)
+   β†’ If previous turn only retrieved toplines (no question_info):
+     * Stage 1: QUESTIONNAIRE to identify question from toplines variable_name
+     * Stage 2: CROSSTABS with question_ids from Stage 1
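
For illustration, the efficient 2-stage brief this prompt asks for might look roughly like the dict below. Field names are indicative only; the real brief schema is defined in the agent code:

```python
# Hypothetical shape of an execute_stages brief for
# "how do immigration responses vary by party?" (topic-only, no time period).
research_brief = {
    "action": "execute_stages",
    "stages": [
        {
            "pipeline": "questionnaire",
            "query": "immigration questions",
            "filters": {"topic": "immigration"},  # deliberately no year/month
        },
        {
            "pipeline": "crosstabs",
            "query": "breakdown by political party",
            "use_previous_results_for":
                "Extract question IDs from stage 1 for crosstab filtering",
        },
    ],
}
```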
prompts/synthesis_prompt_system.txt CHANGED
@@ -31,9 +31,16 @@ You are a survey data analyst synthesizing research results.
  - Instead: "Male: 45% approve, 30% disapprove. Female: 35% approve, 40% disapprove"
  - If the exact breakdown isn't in the context, state "Gender breakdown data is not available in the retrieved crosstabs"
 
- **3. RELEVANCE CHECK**
- - Only synthesize data relevant to the user's question
- - If information doesn't match, explicitly state this
+ **3. RELEVANCE CHECK - BE PERMISSIVE**
+ - The data has ALREADY been filtered by topic, so assume it IS relevant
+ - Subtopics and specific aspects ARE ALWAYS relevant:
+   * "personal financial situation" IS economy
+   * "tariffs" IS economy
+   * "stock market concerns" IS economy
+   * "gender-affirming healthcare" IS healthcare
+   * "Biden approval" IS presidential approval
+ - ONLY reject data if about a COMPLETELY unrelated topic (e.g., user asked "economy" but data is "favorite sports team")
+ - When in doubt, PRESENT THE DATA - do not be overly cautious
  - If crosstabs exist but don't contain the requested demographic breakdown, state this clearly
 
  **4. DATA ACCURACY**
@@ -56,6 +63,8 @@ You are a survey data analyst synthesizing research results.
  - Acknowledge missing data naturally
 
  **7. PRESENTATION FORMAT**
+ - **PRESENT ALL QUESTIONS**: If multiple questions are in the data, present ALL of them, not just one
+ - For EACH question include: Question text, poll date/year/month, sample size (N), and demographic breakdowns
  - Markdown tables for demographic breakdowns (political party, age, gender)
  - Clear headers, consistent formatting
  - Time-series organized by time period
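
A rough sketch of how this system prompt and the user prompt in the next file are presumably combined at synthesis time, assuming LangChain's ChatPromptTemplate (an import the repo already uses elsewhere); the exact wiring lives in the agent code, not in this commit:

```python
from pathlib import Path

from langchain.prompts import ChatPromptTemplate

prompts_dir = Path("prompts")
system_text = (prompts_dir / "synthesis_prompt_system.txt").read_text(encoding="utf-8")
user_text = (prompts_dir / "synthesis_prompt_user.txt").read_text(encoding="utf-8")

# The user template carries the retrieved raw data and the user's question;
# the system template carries the synthesis rules shown above.
synthesis_prompt = ChatPromptTemplate.from_messages([
    ("system", system_text),
    ("human", user_text),
])
```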
prompts/synthesis_prompt_user.txt CHANGED
@@ -18,10 +18,16 @@ Retrieved raw data:
  - INCORRECT: "The retrieved data provides a list of questions..."
  - Include metadata (year/month/poll) when available
 
- **1. RELEVANCE CHECK FIRST**
- - Check if each stage's data actually answers the question
- - If data is about a DIFFERENT topic, state this explicitly
- - Do NOT provide detailed analysis of irrelevant data
+ **1. ASSUME RELEVANCE - BE PERMISSIVE**
+ - The data has ALREADY been filtered by topic, so it IS relevant
+ - Subtopics and specific aspects ARE ALWAYS relevant:
+   * "personal financial situation" IS about economy
+   * "tariffs" IS about economy
+   * "stock market" IS about economy
+   * "gender-affirming healthcare" IS about healthcare
+   * "Trump approval" IS about presidential approval
+ - ONLY reject if about COMPLETELY unrelated topic (e.g., user asked "economy" but data is "favorite sports team")
+ - When in doubt, PRESENT THE DATA - err on the side of inclusion
 
  **2. EXTRACT ACTUAL NUMBERS - NO GENERIC DESCRIPTIONS**
  - **QUESTIONNAIRE**: Format questions with text, response options, topics
@@ -37,6 +43,12 @@ Retrieved raw data:
  - Format numbers/percentages clearly
 
  **4. PRESENTATION FORMAT**
+ - **CRITICAL: PRESENT ALL QUESTIONS** - If you have data for 5 questions, present ALL 5, not just 1
+ - For EACH question, include:
+   * Question text
+   * Poll date (year/month)
+   * Sample size (N)
+   * Complete demographic breakdown with actual percentages
  - Use markdown tables for demographic breakdowns:
  ```
  | Response Option | Democrat | Republican | Independent |
questionnaire_rag.py CHANGED
@@ -1,12 +1,9 @@
  """
- Questionnaire RAG with better filtering and anti-hallucination measures.
-
- Key improvements:
- 1. Correct Pinecone filter syntax
- 2. Post-retrieval validation of filters
- 3. Stronger anti-hallucination prompts
- 4. Explicit checks for data existence
- 5. Fuzzy survey name matching
  """
 
  import os
@@ -14,11 +11,9 @@ import json
  from typing import List, Dict, Any, Optional
  from pathlib import Path
 
- from langchain_openai import OpenAIEmbeddings, ChatOpenAI
  from langchain_pinecone import PineconeVectorStore
  from pinecone import Pinecone
- from langchain.prompts import ChatPromptTemplate
- from langchain.schema.output_parser import StrOutputParser
 
  try:
      from dotenv import load_dotenv
@@ -27,23 +22,29 @@ except ImportError:
      pass
 
 
- def _load_prompt_file(filename: str) -> str:
-     """Load a prompt file from the prompts directory"""
-     prompt_dir = Path(__file__).parent / "prompts"
-     prompt_path = prompt_dir / filename
-     if not prompt_path.exists():
-         raise FileNotFoundError(f"Prompt file not found: {prompt_path}")
-     return prompt_path.read_text(encoding="utf-8")
 
 
  class QuestionnaireRAG:
-     """
-     Improved questionnaire RAG with:
-     - Better Pinecone filtering
-     - Post-retrieval validation
-     - Anti-hallucination measures
-     - Fuzzy survey name matching
-     """
 
      def __init__(
          self,
@@ -62,17 +63,6 @@ class QuestionnaireRAG:
              model=os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
          )
 
-         # Initialize LLM
-         chat_model = os.getenv("OPENAI_MODEL", "gpt-4o")
-         self.llm = ChatOpenAI(model=chat_model, temperature=0)
-
-         # Load vector store
-         if not os.path.exists(persist_directory):
-             raise ValueError(
-                 f"Vector store not found at {persist_directory}\n"
-                 "Run create_questionnaire_vectorstores.py first"
-             )
-
          # Connect to Pinecone
          index_name = os.getenv("PINECONE_INDEX_NAME", "poll-questionnaire-index")
          namespace = os.getenv("PINECONE_NAMESPACE") or None
@@ -95,127 +85,90 @@ class QuestionnaireRAG:
      def _load_catalog(self) -> Dict[str, Dict]:
          """Load poll catalog"""
          catalog_path = Path(self.persist_directory) / "poll_catalog.json"
-         if catalog_path.exists():
-             with open(catalog_path, 'r') as f:
-                 return json.load(f)
-         return {}
 
      def _load_questions_index(self) -> Dict[str, Dict]:
          """Load questions index"""
          questions_path = Path(self.persist_directory) / "questions_index.json"
-         if questions_path.exists():
-             with open(questions_path, 'r') as f:
-                 return json.load(f)
-         return {}
-
-     def get_available_survey_names(self) -> List[str]:
-         """Get list of unique survey names from the catalog"""
-         survey_names = set()
-         for info in self.poll_catalog.values():
-             survey_names.add(info["survey_name"])
-         return sorted(survey_names)
 
      def _fuzzy_match_survey_name(self, requested_name: str) -> Optional[str]:
-         """
-         Fuzzy match a requested survey name to an actual stored name.
-
-         Examples:
-         - "Unity Poll" β†’ "Vanderbilt_Unity_Poll"
-         - "unity poll" β†’ "Vanderbilt_Unity_Poll"
-         - "Vanderbilt Unity" β†’ "Vanderbilt_Unity_Poll"
-         """
-         # Get all unique survey names
-         available_names = self.get_available_survey_names()
 
-         # Normalize the requested name
          normalized_requested = requested_name.lower().replace("_", " ").replace("-", " ")
 
-         # Try exact match first (case-insensitive)
          for stored_name in available_names:
             normalized_stored = stored_name.lower().replace("_", " ").replace("-", " ")
             if normalized_requested == normalized_stored:
                 return stored_name
-
-         # Try substring matching - check if requested is in stored
-         for stored_name in available_names:
-             normalized_stored = stored_name.lower().replace("_", " ").replace("-", " ")
-             if normalized_requested in normalized_stored:
-                 return stored_name
-
-         # Try reverse - check if stored is in requested
-         for stored_name in available_names:
-             normalized_stored = stored_name.lower().replace("_", " ").replace("-", " ")
-             if normalized_stored in normalized_requested:
                 return stored_name
 
-         # Try word-level matching - if all words from requested are in stored
         requested_words = set(normalized_requested.split())
         for stored_name in available_names:
             normalized_stored = stored_name.lower().replace("_", " ").replace("-", " ")
             stored_words = set(normalized_stored.split())
-
-             # Check if requested words are a subset of stored words
             if requested_words.issubset(stored_words):
                 return stored_name
 
         return None
 
     def _build_pinecone_filter(self, filters: Dict[str, Any]) -> Optional[Dict[str, Any]]:
-         """
-         Build proper Pinecone metadata filter with fuzzy survey name matching.
-
-         Pinecone filter syntax:
-         - Simple: {"year": 2025}
-         - Multiple: {"$and": [{"year": 2025}, {"month": "February"}]}
-         """
         if not filters:
             return None
 
         filter_conditions = []
 
-         # Handle year filter
-         if "year" in filters:
-             year = filters["year"]
-             if isinstance(year, str):
-                 year = int(year)
             filter_conditions.append({"year": {"$eq": year}})
 
-         # Handle month filter
-         if "month" in filters:
-             month = filters["month"]
-             # Ensure proper capitalization
-             if isinstance(month, str):
-                 month = month.capitalize()
             filter_conditions.append({"month": {"$eq": month}})
 
-         # Handle poll_date filter (exact match)
-         if "poll_date" in filters:
             filter_conditions.append({"poll_date": {"$eq": filters["poll_date"]}})
 
-         # Handle survey_name filter with fuzzy matching
-         if "survey_name" in filters:
-             requested_name = filters["survey_name"]
-
-             # Try to fuzzy match the survey name
-             matched_name = self._fuzzy_match_survey_name(requested_name)
-
             if matched_name:
-                 if self.verbose and matched_name != requested_name:
-                     print(f"πŸ”„ Mapped survey name '{requested_name}' β†’ '{matched_name}'")
                 filter_conditions.append({"survey_name": {"$eq": matched_name}})
-             else:
-                 if self.verbose:
-                     print(f"⚠️ Survey name '{requested_name}' not found in catalog")
-                     print(f"   Available: {self.get_available_survey_names()}")
-                 # Don't add the filter if we can't match it - let other filters work
 
-         # Handle topics (if a topic is in the comma-separated list)
-         if "topic" in filters:
-             # This is trickier with comma-separated strings in metadata
-             # For now, we'll do post-filtering
-             pass
 
-         # Combine filters
         if len(filter_conditions) == 0:
             return None
         elif len(filter_conditions) == 1:
@@ -223,123 +176,6 @@ class QuestionnaireRAG:
         else:
             return {"$and": filter_conditions}
 
-     def _validate_results(
-         self,
-         docs: List[Any],
-         filters: Dict[str, Any]
-     ) -> List[Any]:
-         """
-         Validate that retrieved documents actually match the filters.
-
-         This catches cases where:
-         1. Pinecone filtering didn't work correctly
-         2. We need to do additional filtering (like topic matching)
-         """
-         if not filters:
-             return docs
-
-         validated_docs = []
-
-         for doc in docs:
-             metadata = doc.metadata
-             valid = True
-
-             # Check year
-             if "year" in filters:
-                 expected_year = int(filters["year"]) if isinstance(filters["year"], str) else filters["year"]
-                 if metadata.get("year") != expected_year:
-                     if self.verbose:
-                         print(f"⚠️ Filtered out: wrong year {metadata.get('year')} != {expected_year}")
-                     valid = False
-
-             # Check month
-             if "month" in filters and valid:
-                 expected_month = filters["month"].capitalize() if isinstance(filters["month"], str) else filters["month"]
-                 if metadata.get("month") != expected_month:
-                     if self.verbose:
-                         print(f"⚠️ Filtered out: wrong month {metadata.get('month')} != {expected_month}")
-                     valid = False
-
-             # Check poll_date
-             if "poll_date" in filters and valid:
-                 if metadata.get("poll_date") != filters["poll_date"]:
-                     if self.verbose:
-                         print(f"⚠️ Filtered out: wrong poll_date {metadata.get('poll_date')} != {filters['poll_date']}")
-                     valid = False
-
-             # Check survey_name (with fuzzy matching)
-             if "survey_name" in filters and valid:
-                 requested_name = filters["survey_name"]
-                 matched_name = self._fuzzy_match_survey_name(requested_name)
-                 if matched_name and metadata.get("survey_name") != matched_name:
-                     if self.verbose:
-                         print(f"⚠️ Filtered out: wrong survey {metadata.get('survey_name')} != {matched_name}")
-                     valid = False
-
-             # Check topic (if topic filter is provided)
-             if "topic" in filters and valid:
-                 expected_topic = filters["topic"].lower()
-                 # Topics are stored as comma-separated string in metadata
-                 doc_topics = metadata.get("topics", "")
-                 if isinstance(doc_topics, str):
-                     doc_topics_list = [t.strip().lower() for t in doc_topics.split(",")]
-                 elif isinstance(doc_topics, list):
-                     doc_topics_list = [str(t).strip().lower() for t in doc_topics]
-                 else:
-                     doc_topics_list = []
-
-                 if self.verbose and valid:
-                     var_name = metadata.get("variable_name", "unknown")
-                     print(f"   πŸ” Checking topic '{expected_topic}' for {var_name}: doc_topics={doc_topics_list}")
-
-                 if expected_topic not in doc_topics_list:
-                     if self.verbose:
-                         var_name = metadata.get("variable_name", "unknown")
-                         print(f"⚠️ Filtered out {var_name}: topic '{expected_topic}' not in {doc_topics_list}")
-                     valid = False
-
-             if valid:
-                 validated_docs.append(doc)
-
-         return validated_docs
-
-     def _get_prompt(self) -> ChatPromptTemplate:
-         """Get the improved system prompt with anti-hallucination measures"""
-         system_prompt_template = _load_prompt_file("questionnaire_rag_prompt.txt")
-         return ChatPromptTemplate.from_messages([
-             ("system", system_prompt_template),
-             ("human", "Answer:")
-         ])
-
-     def query(self, question: str, filters: Optional[Dict[str, Any]] = None, k: int = 20) -> str:
-         """
-         Query the questionnaire system.
-
-         Args:
-             question: Natural language question
-             filters: Optional filters (year, month, poll_date, survey_name)
-             k: Number of results to retrieve
-
-         Returns:
-             Answer string
-         """
-         result = self._query_internal(question, filters, k)
-         return result['answer']
-
-     def query_with_metadata(
-         self,
-         question: str,
-         filters: Optional[Dict[str, Any]] = None,
-         k: int = 20
-     ) -> Dict[str, Any]:
-         """
-         Query with full metadata about retrieval.
-
-         Returns:
-             Dict with 'answer', 'source_questions', 'num_sources', 'filters_applied'
-         """
-         return self._query_internal(question, filters, k)
-
      def retrieve_raw_data(
          self,
          question: str,
@@ -347,250 +183,92 @@ class QuestionnaireRAG:
          k: int = 20
      ) -> Dict[str, Any]:
          """
-         Retrieve raw data without LLM formatting.
-         Used by agent framework to get raw data for synthesis.
 
          Returns:
-             Dict with 'source_questions', 'num_sources', 'filters_applied', 'retrieved_docs'
          """
          if self.verbose:
-             print(f"\nπŸ“Š [Raw Data] Query: {question}")
              if filters:
                  print(f"πŸ” Filters: {filters}")
 
          # Build Pinecone filter
          pinecone_filter = self._build_pinecone_filter(filters or {})
 
-         # Retrieve documents
          if pinecone_filter:
              if self.verbose:
-                 print(f"πŸ”§ Pinecone filter: {pinecone_filter}")
              retriever = self.vectorstore.as_retriever(
                  search_kwargs={"k": k, "filter": pinecone_filter}
              )
-         else:
-             retriever = self.vectorstore.as_retriever(search_kwargs={"k": k})
-
-         docs = retriever.invoke(question)
-
-         if self.verbose:
-             print(f"πŸ“₯ Retrieved {len(docs)} documents from Pinecone")
-
-         # Validate results match filters
-         if filters:
-             docs = self._validate_results(docs, filters)
              if self.verbose:
-                 print(f"βœ… After validation: {len(docs)} documents")
 
-         # Check if we have any results
          if not docs:
-             return {
-                 "source_questions": [],
-                 "num_sources": 0,
-                 "filters_applied": filters or {},
-                 "retrieved_docs": []
-             }
-
-         # Reconstruct full questions
-         full_questions = []
-         seen_ids = set()
-
-         for doc in docs:
-             q_id = doc.metadata.get('question_id')
-             if q_id and q_id not in seen_ids:
-                 if q_id in self.questions_by_id:
-                     full_questions.append(self.questions_by_id[q_id])
-                     seen_ids.add(q_id)
-
-         # Sort by position to maintain survey order
-         full_questions.sort(key=lambda q: (q.get('poll_date', ''), q.get('position', 0)))
-
-         return {
-             'source_questions': full_questions,
-             'num_sources': len(full_questions),
-             'filters_applied': filters or {},
-             'retrieved_docs': docs
-         }
-
-     def _query_internal(
-         self,
-         question: str,
-         filters: Optional[Dict[str, Any]] = None,
-         k: int = 20
-     ) -> Dict[str, Any]:
-         """Internal query implementation"""
-
-         if self.verbose:
-             print(f"\nπŸ“Š Query: {question}")
-             if filters:
-                 print(f"πŸ” Filters: {filters}")
-
-         # Build Pinecone filter
-         pinecone_filter = self._build_pinecone_filter(filters or {})
-
-         # Retrieve documents
-         if pinecone_filter:
             if self.verbose:
-                 print(f"πŸ”§ Pinecone filter: {pinecone_filter}")
-             retriever = self.vectorstore.as_retriever(
-                 search_kwargs={"k": k, "filter": pinecone_filter}
-             )
-         else:
-             retriever = self.vectorstore.as_retriever(search_kwargs={"k": k})
-
-         docs = retriever.invoke(question)
-
-         if self.verbose:
-             print(f"πŸ“₯ Retrieved {len(docs)} documents from Pinecone")
-
-         # Validate results match filters
-         if filters:
-             docs = self._validate_results(docs, filters)
             if self.verbose:
-                 print(f"βœ… After validation: {len(docs)} documents")
 
-         # Check if we have any results
         if not docs:
-             no_data_msg = f"No questionnaire data found"
-             if filters:
-                 filter_desc = ", ".join([f"{k}={v}" for k, v in filters.items()])
-                 no_data_msg += f" matching filters: {filter_desc}"
-
             return {
-                 "answer": no_data_msg,
                 "source_questions": [],
                 "num_sources": 0,
-                 "filters_applied": filters or {}
             }
 
-         # Reconstruct full questions
         full_questions = []
         seen_ids = set()
 
         for doc in docs:
             q_id = doc.metadata.get('question_id')
             if q_id and q_id not in seen_ids:
                 if q_id in self.questions_by_id:
-                     full_questions.append(self.questions_by_id[q_id])
                     seen_ids.add(q_id)
-
-         # Sort by position to maintain survey order
         full_questions.sort(key=lambda q: (q.get('poll_date', ''), q.get('position', 0)))
 
-         # Format context with explicit data availability info
-         context = self._format_context(full_questions, filters)
-
-         # Get prompt
-         prompt = self._get_prompt()
-
-         # Create chain
-         chain = (
-             {
-                 "context": lambda x: context,
-                 "question": lambda x: question,
-                 "catalog": lambda x: self._get_catalog_summary()
-             }
-             | prompt
-             | self.llm
-             | StrOutputParser()
-         )
-
-         # Get answer
-         answer = chain.invoke(question)
 
         return {
-             'answer': answer,
             'source_questions': full_questions,
             'num_sources': len(full_questions),
-             'filters_applied': filters or {}
         }
 
-     def _format_context(
-         self,
-         questions: List[Dict],
-         filters: Optional[Dict[str, Any]] = None
-     ) -> str:
-         """Format questions as context with explicit data availability"""
-
-         if not questions:
-             filter_desc = ""
-             if filters:
-                 filter_desc = f" matching {filters}"
-             return f"⚠️ NO DATA RETRIEVED{filter_desc}\n\nYou must inform the user that no data exists for their query."
-
-         context_parts = []
-
-         # Add explicit note about what data we have
-         polls_found = sorted(set(q['poll_date'] for q in questions))
-         context_parts.append(f"βœ… DATA AVAILABLE FOR: {', '.join(polls_found)}")
-
-         # Add note about what was requested vs what was found
-         if filters:
-             if 'year' in filters and 'month' in filters:
-                 requested = f"{filters['month']} {filters['year']}"
-                 context_parts.append(f"πŸ” REQUESTED: {requested}")
-
-         context_parts.append("")  # Blank line
-         context_parts.append("=" * 80)
-         context_parts.append("")
-
-         # Format each question
-         for i, q in enumerate(questions, 1):
-             part = f"""
- --- Question {i} from {q['survey_name']} ({q['poll_date']}) ---
- Variable: {q['variable_name']}
- Question: {q['question_text']}
- Response Options: {' | '.join(q['response_options'])}
- Topics: {', '.join(q['topics'])}
- Question Type: {q['question_type']}
- Administration: {q['ask_condition']}
- """
-
-             # Add skip logic/sampling
-             if q.get('skip_logic'):
-                 part += f"Skip Logic: {q['skip_logic']}\n"
-
-             if q.get('half_sample_group'):
-                 part += f"Half Sample Group: {q['half_sample_group']}\n"
-
-             # Add sibling variants
-             if q.get('sibling_variants'):
-                 part += f"\nAlternate Versions (shown to different groups):\n"
-                 for sib in q['sibling_variants']:
-                     sib_group = sib.get('half_sample_group', 'other group')
-                     part += f"   - [{sib_group}] {sib['question_text']}\n"
-
-             # Add sequence context
-             if q.get('previous_question'):
-                 prev_vars = q.get('previous_question_variants', [])
-                 if len(prev_vars) > 1:
-                     part += "\nPrevious Question (respondents saw one of these):\n"
-                     for pv in prev_vars:
-                         part += f"   - {pv['question_text']}\n"
-                 else:
-                     part += f"\nPrevious Question: {q['previous_question']['question_text']}\n"
-
-             if q.get('next_question'):
-                 next_vars = q.get('next_question_variants', [])
-                 if len(next_vars) > 1:
-                     part += "\nNext Question (respondents saw one of these):\n"
-                     for nv in next_vars:
-                         part += f"   - {nv['question_text']}\n"
-                 else:
-                     part += f"\nNext Question: {q['next_question']['question_text']}\n"
-
-             context_parts.append(part.strip())
-
-         return "\n\n".join(context_parts)
-
-     def _get_catalog_summary(self) -> str:
-         """Get summary of available polls"""
-         lines = ["Available polls:"]
-         for poll_date in sorted(self.poll_catalog.keys()):
-             info = self.poll_catalog[poll_date]
-             month_str = f" ({info['month']})" if info.get('month') else ""
-             lines.append(f"- {poll_date}{month_str}: {info['num_questions']} questions")
-         return "\n".join(lines)
 
     def get_available_polls(self) -> List[Dict[str, Any]]:
         """Get list of all available polls"""
@@ -605,51 +283,3 @@ Administration: {q['ask_condition']}
             for poll_date, info in sorted(self.poll_catalog.items())
         ]
 
-
- def main():
-     """Test CLI"""
-     import sys
-
-     openai_api_key = os.getenv("OPENAI_API_KEY")
-     pinecone_api_key = os.getenv("PINECONE_API_KEY")
-
-     if not openai_api_key or not pinecone_api_key:
-         print("Error: Missing API keys")
-         sys.exit(1)
-
-     rag = QuestionnaireRAG(
-         openai_api_key=openai_api_key,
-         pinecone_api_key=pinecone_api_key,
-         verbose=True
-     )
-
-     print("\n" + "="*80)
-     print("QUESTIONNAIRE RAG - TEST MODE")
-     print("="*80)
-
-     # Test fuzzy matching
-     print("\nπŸ§ͺ TEST: Fuzzy survey name matching")
-     test_names = ["Unity Poll", "unity poll", "Vanderbilt Unity", "UNITY"]
-     for name in test_names:
-         matched = rag._fuzzy_match_survey_name(name)
-         print(f"   '{name}' β†’ '{matched}'")
-
-     # Test with the problematic query
-     print("\nπŸ§ͺ TEST: Query that previously failed")
639
- print("Query: What questions were asked in the June 2025 Unity Poll?")
640
-
641
- filters = {"year": 2025, "month": "June", "survey_name": "Unity Poll"}
642
- result = rag.query_with_metadata(
643
- "What questions were asked in the June 2025 Unity Poll?",
644
- filters=filters
645
- )
646
-
647
- print(f"\nπŸ“Š Results:")
648
- print(f"Found: {result['num_sources']} questions")
649
- print(f"\n{result['answer'][:500]}...")
650
-
651
- print("\n" + "="*80)
652
-
653
-
654
- if __name__ == "__main__":
655
- main()
 
  """
+ Questionnaire RAG Module
+ ------------------------
+ Retrieves survey questions from Pinecone vectorstore.
+ Metadata filtering first, semantic search fallback.
+ Returns raw data only - no synthesis.
  """

  import os
  from typing import List, Dict, Any, Optional
  from pathlib import Path

+ from langchain_openai import OpenAIEmbeddings
  from langchain_pinecone import PineconeVectorStore
  from pinecone import Pinecone

  try:
      from dotenv import load_dotenv
      pass


+ class QuestionInfo:
+     """Structured question information for cross-pipeline coordination."""
+     def __init__(self, variable_name: str, year: Optional[int] = None,
+                  month: Optional[str] = None, poll_date: Optional[str] = None,
+                  question_id: Optional[str] = None):
+         self.variable_name = variable_name
+         self.year = year
+         self.month = month
+         self.poll_date = poll_date
+         self.question_id = question_id
+
+     def to_dict(self) -> Dict[str, Any]:
+         return {
+             "variable_name": self.variable_name,
+             "year": self.year,
+             "month": self.month,
+             "poll_date": self.poll_date,
+             "question_id": self.question_id
+         }


  class QuestionnaireRAG:
+     """Questionnaire RAG with metadata-first filtering."""

      def __init__(
          self,
              model=os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
          )

          # Connect to Pinecone
          index_name = os.getenv("PINECONE_INDEX_NAME", "poll-questionnaire-index")
          namespace = os.getenv("PINECONE_NAMESPACE") or None

      def _load_catalog(self) -> Dict[str, Dict]:
          """Load poll catalog"""
          catalog_path = Path(self.persist_directory) / "poll_catalog.json"
+         if not catalog_path.exists():
+             # Try parent directory if not found
+             parent_path = Path(self.persist_directory).parent / "questionnaire_vectorstores" / "poll_catalog.json"
+             if parent_path.exists():
+                 catalog_path = parent_path
+             else:
+                 return {}
+
+         with open(catalog_path, 'r') as f:
+             return json.load(f)

      def _load_questions_index(self) -> Dict[str, Dict]:
          """Load questions index"""
          questions_path = Path(self.persist_directory) / "questions_index.json"
+         if not questions_path.exists():
+             # Try parent directory if not found
+             parent_path = Path(self.persist_directory).parent / "questionnaire_vectorstores" / "questions_index.json"
+             if parent_path.exists():
+                 questions_path = parent_path
+             else:
+                 return {}
+
+         with open(questions_path, 'r') as f:
+             return json.load(f)

      def _fuzzy_match_survey_name(self, requested_name: str) -> Optional[str]:
+         """Fuzzy match survey name"""
+         available_names = set()
+         for info in self.poll_catalog.values():
+             available_names.add(info["survey_name"])

          normalized_requested = requested_name.lower().replace("_", " ").replace("-", " ")

          for stored_name in available_names:
              normalized_stored = stored_name.lower().replace("_", " ").replace("-", " ")
              if normalized_requested == normalized_stored:
                  return stored_name
+             if normalized_requested in normalized_stored or normalized_stored in normalized_requested:
                  return stored_name

          requested_words = set(normalized_requested.split())
          for stored_name in available_names:
              normalized_stored = stored_name.lower().replace("_", " ").replace("-", " ")
              stored_words = set(normalized_stored.split())
              if requested_words.issubset(stored_words):
                  return stored_name

          return None

      def _build_pinecone_filter(self, filters: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+         """Build Pinecone metadata filter"""
          if not filters:
              return None

          filter_conditions = []

+         if "year" in filters and filters["year"] is not None:
+             year = int(filters["year"]) if isinstance(filters["year"], str) else filters["year"]
              filter_conditions.append({"year": {"$eq": year}})

+         if "month" in filters and filters["month"] is not None:
+             month = filters["month"].capitalize()
              filter_conditions.append({"month": {"$eq": month}})

+         if "poll_date" in filters and filters["poll_date"] is not None:
              filter_conditions.append({"poll_date": {"$eq": filters["poll_date"]}})

+         if "survey_name" in filters and filters["survey_name"] is not None:
+             matched_name = self._fuzzy_match_survey_name(filters["survey_name"])
              if matched_name:
                  filter_conditions.append({"survey_name": {"$eq": matched_name}})

+         if "question_ids" in filters and filters["question_ids"]:
+             question_ids = filters["question_ids"]
+             if isinstance(question_ids, list) and len(question_ids) > 0:
+                 if len(question_ids) == 1:
+                     filter_conditions.append({"question_id": {"$eq": question_ids[0]}})
+                 else:
+                     filter_conditions.append({"question_id": {"$in": question_ids}})
+
+         if "topic" in filters and filters["topic"]:
+             topic = filters["topic"].lower()
+             filter_conditions.append({"topics": {"$in": [topic]}})

          if len(filter_conditions) == 0:
              return None
          elif len(filter_conditions) == 1:
          else:
              return {"$and": filter_conditions}

      def retrieve_raw_data(
          self,
          question: str,
          k: int = 20
      ) -> Dict[str, Any]:
          """
+         Retrieve raw questionnaire data.
+         Metadata filtering first, semantic search fallback.

          Returns:
+             Dict with 'source_questions', 'num_sources', 'filters_applied', 'question_info'
          """
          if self.verbose:
+             print(f"\nπŸ“Š [Questionnaire] Query: {question}")
              if filters:
                  print(f"πŸ” Filters: {filters}")

          # Build Pinecone filter
          pinecone_filter = self._build_pinecone_filter(filters or {})

+         # Try metadata filtering first
+         docs = []
          if pinecone_filter:
              if self.verbose:
+                 print(f"πŸ”§ Using metadata filter: {pinecone_filter}")
              retriever = self.vectorstore.as_retriever(
                  search_kwargs={"k": k, "filter": pinecone_filter}
              )
+             docs = retriever.invoke(question)
+
              if self.verbose:
+                 print(f"πŸ“₯ Retrieved {len(docs)} documents with metadata filter")

+         # Fallback to semantic search if no results
          if not docs:
              if self.verbose:
+                 print(f"⚠️ No results with metadata filter, falling back to semantic search")
+             retriever = self.vectorstore.as_retriever(search_kwargs={"k": k * 2})
+             docs = retriever.invoke(question)
+
              if self.verbose:
+                 print(f"πŸ“₯ Retrieved {len(docs)} documents with semantic search")

          if not docs:
              return {
                  "source_questions": [],
                  "num_sources": 0,
+                 "filters_applied": filters or {},
+                 "question_info": []
              }

+         # Reconstruct full questions and extract question_info
          full_questions = []
          seen_ids = set()
+         question_info_list = []

          for doc in docs:
              q_id = doc.metadata.get('question_id')
              if q_id and q_id not in seen_ids:
                  if q_id in self.questions_by_id:
+                     q_data = self.questions_by_id[q_id]
+                     full_questions.append(q_data)
                      seen_ids.add(q_id)
+
+                     # Extract question_info
+                     question_info_list.append(QuestionInfo(
+                         variable_name=q_data.get("variable_name", ""),
+                         year=q_data.get("year"),
+                         month=q_data.get("month", ""),
+                         poll_date=q_data.get("poll_date", ""),
+                         question_id=q_id
+                     ))
+
+         # Sort by position
          full_questions.sort(key=lambda q: (q.get('poll_date', ''), q.get('position', 0)))

+         if self.verbose:
+             print(f"βœ… Extracted {len(question_info_list)} question info entries")

          return {
              'source_questions': full_questions,
              'num_sources': len(full_questions),
+             'filters_applied': filters or {},
+             'question_info': [q.to_dict() for q in question_info_list]
          }

+     def get_available_survey_names(self) -> List[str]:
+         """Get list of unique survey names"""
+         survey_names = set()
+         for info in self.poll_catalog.values():
+             survey_names.add(info["survey_name"])
+         return sorted(survey_names)

      def get_available_polls(self) -> List[Dict[str, Any]]:
          """Get list of all available polls"""
              for poll_date, info in sorted(self.poll_catalog.items())
          ]
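For context during review: a minimal sketch of how the reworked retrieval entry point might be exercised. The module name, the constructor keyword arguments (mirroring the removed main() test CLI and app.py), and the filters keyword (inferred from the method body above) are assumptions rather than a confirmed final signature.

import os
from questionnaire_rag import QuestionnaireRAG  # assumed module name

# Hypothetical usage sketch (not part of the commit).
rag = QuestionnaireRAG(
    openai_api_key=os.getenv("OPENAI_API_KEY"),      # assumed kwargs, as in the old main()
    pinecone_api_key=os.getenv("PINECONE_API_KEY"),
    verbose=True,
)

result = rag.retrieve_raw_data(
    "What questions were asked in the June 2025 Unity Poll?",
    filters={"year": 2025, "month": "June", "survey_name": "Unity Poll"},
)

# question_info carries variable_name/year/month/poll_date/question_id so the
# toplines and crosstabs pipelines can filter on exact metadata downstream.
for info in result["question_info"]:
    print(info["variable_name"], info["poll_date"])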
relevance_checker.py ADDED
@@ -0,0 +1,248 @@
+ """
+ Conversation Relevance Checker
+ -------------------------------
+ Determines if current question is related to previous conversation
+ and identifies what data can be reused to minimize redundant API calls.
+ """
+
+ import os
+ from typing import List, Dict, Any, Optional, Literal
+ from pathlib import Path
+
+ from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
+ from pydantic import BaseModel, Field
+
+
+ def _load_prompt_file(filename: str) -> str:
+     """Load a prompt file from the prompts directory"""
+     prompt_dir = Path(__file__).parent / "prompts"
+     prompt_path = prompt_dir / filename
+     if not prompt_path.exists():
+         raise FileNotFoundError(f"Prompt file not found: {prompt_path}")
+     return prompt_path.read_text(encoding="utf-8")
+
+
+ class ReusableData(BaseModel):
+     """Indicates what data can be reused from previous conversation"""
+     questions: bool = False
+     toplines: bool = False
+     crosstabs: bool = False
+
+
+ class RelevanceResult(BaseModel):
+     """Structured relevance assessment result"""
+     is_related: bool
+     relation_type: Literal[
+         "same_topic_different_demo",
+         "same_topic_different_time",
+         "trend_analysis",
+         "new_topic"
+     ]
+     reusable_data: ReusableData
+     time_period_changed: bool
+     reasoning: str
+
+
+ class ConversationRelevanceChecker:
+     """
+     Checks relevance between current question and conversation history.
+     Uses LLM to determine if previous data can be reused.
+     """
+
+     def __init__(self, llm, verbose: bool = False):
+         """
+         Initialize relevance checker.
+
+         Args:
+             llm: LangChain LLM instance (ChatOpenAI)
+             verbose: Whether to print debug information
+         """
+         self.llm = llm
+         self.verbose = verbose
+
+         # Load relevance check prompt
+         try:
+             self.prompt_template = _load_prompt_file("relevance_check_prompt.txt")
+         except FileNotFoundError:
+             # Fallback to inline prompt if file doesn't exist yet
+             self.prompt_template = self._get_default_prompt()
+
+     def _get_default_prompt(self) -> str:
+         """Fallback prompt template if file doesn't exist"""
+         return """You are analyzing conversation continuity in a multi-turn survey data analysis system.
+
+ Your task: Determine if the current question is related to previous conversation and what data can be reused.
+
+ ## CONVERSATION HISTORY
+ {conversation_summary}
+
+ ## PREVIOUSLY RETRIEVED DATA
+ {previous_data_summary}
+
+ ## CURRENT QUESTION
+ {current_question}
+
+ ## ANALYSIS REQUIRED
+
+ 1. **Is the current question related to the previous conversation?**
+    - YES if: Same topic, same questions, same time period (even if different demographic)
+    - YES if: Asking for trend/analysis of already-shown data
+    - NO if: Completely different topic
+    - NO if: Same topic but different time period (e.g., June 2025 β†’ February 2025)
+
+ 2. **Relation Type** (if related):
+    - `same_topic_different_demo`: Same topic/questions, asking for different demographic breakdown
+    - `trend_analysis`: Asking for analysis/trends from already-retrieved data
+    - `same_topic_different_time`: Same topic but different time period
+    - `new_topic`: Completely different topic
+
+ 3. **Reusable Data**:
+    - `questions`: true if same questions can be reused (same topic, same time period)
+    - `toplines`: true if overall frequencies already retrieved and still relevant
+    - `crosstabs`: true if demographic breakdowns already retrieved and still relevant
+
+ 4. **Time Period Changed**:
+    - true if current question asks about different year/month than previous
+    - false if time period is same or not specified
+
+ Respond with structured output."""
+
+     def _build_conversation_summary(self, conversation_history: List) -> str:
+         """Build a summary of conversation history for the prompt"""
+         summary_lines = []
+
+         for msg in conversation_history:
+             if isinstance(msg, HumanMessage):
+                 summary_lines.append(f"USER: {msg.content}")
+             elif isinstance(msg, AIMessage):
+                 # Truncate long AI responses
+                 content = msg.content
+                 if len(content) > 300:
+                     content = content[:300] + "... (truncated)"
+                 summary_lines.append(f"ASSISTANT: {content}")
+
+         return "\n".join(summary_lines) if summary_lines else "No previous conversation"
+
+     def _build_previous_data_summary(self, previous_stage_results: List) -> str:
+         """Build a summary of previously retrieved data"""
+         if not previous_stage_results:
+             return "No previous data retrieved"
+
+         summary_lines = []
+
+         for i, stage_result in enumerate(previous_stage_results, 1):
+             summary_lines.append(f"Stage {i}:")
+
+             # Questionnaire results
+             if stage_result.questionnaire_results:
+                 q_res = stage_result.questionnaire_results
+                 num_questions = len(q_res.get("source_questions", []))
+                 question_info = q_res.get("question_info", [])
+
+                 if question_info:
+                     sample_vars = [q.get("variable_name", "unknown") for q in question_info[:3]]
+                     sample_vars_str = ", ".join(sample_vars)
+                     if len(question_info) > 3:
+                         sample_vars_str += f" ... and {len(question_info) - 3} more"
+
+                     # Extract time period info
+                     time_info = []
+                     if question_info[0].get("year"):
+                         time_info.append(str(question_info[0]["year"]))
+                     if question_info[0].get("month"):
+                         time_info.append(question_info[0]["month"])
+                     time_str = " ".join(time_info) if time_info else "unspecified time"
+
+                     summary_lines.append(f"  - Retrieved {num_questions} question(s) from {time_str}")
+                     summary_lines.append(f"  - Variables: {sample_vars_str}")
+
+             # Toplines results
+             if stage_result.toplines_results:
+                 t_res = stage_result.toplines_results
+                 num_docs = len(t_res.get("retrieved_docs", []))
+                 summary_lines.append(f"  - Retrieved {num_docs} topline document(s)")
+
+             # Crosstabs results
+             if stage_result.crosstabs_results:
+                 c_res = stage_result.crosstabs_results
+                 if "crosstab_docs_by_variable" in c_res:
+                     num_vars = len(c_res["crosstab_docs_by_variable"])
+                     summary_lines.append(f"  - Retrieved crosstabs for {num_vars} variable(s)")
+
+         return "\n".join(summary_lines) if summary_lines else "No data summary available"
+
+     def check_relevance(
+         self,
+         current_question: str,
+         conversation_history: List,
+         previous_stage_results: List
+     ) -> Dict[str, Any]:
+         """
+         Check relevance of current question to previous conversation.
+
+         Args:
+             current_question: The current user question
+             conversation_history: List of previous messages (HumanMessage, AIMessage)
+             previous_stage_results: List of StageResult objects from previous turns
+
+         Returns:
+             Dict with relevance assessment (is_related, relation_type, reusable_data, etc.)
+         """
+         if self.verbose:
+             print("\nπŸ” Checking conversation relevance...")
+
+         # Build prompt inputs
+         conversation_summary = self._build_conversation_summary(conversation_history)
+         previous_data_summary = self._build_previous_data_summary(previous_stage_results)
+
+         # Use simple string replacement instead of .format() to avoid issues with curly braces
+         prompt = self.prompt_template.replace("{conversation_summary}", conversation_summary)
+         prompt = prompt.replace("{previous_data_summary}", previous_data_summary)
+         prompt = prompt.replace("{current_question}", current_question)
+
+         # Get structured output from LLM
+         try:
+             relevance_checker = self.llm.with_structured_output(RelevanceResult)
+             result = relevance_checker.invoke([
+                 SystemMessage(content="You are a conversation continuity analyzer for survey data systems."),
+                 HumanMessage(content=prompt)
+             ])
+
+             if self.verbose:
+                 print(f"  Related: {result.is_related}")
+                 print(f"  Type: {result.relation_type}")
+                 print(f"  Reusable: questions={result.reusable_data.questions}, "
+                       f"toplines={result.reusable_data.toplines}, "
+                       f"crosstabs={result.reusable_data.crosstabs}")
+                 print(f"  Time changed: {result.time_period_changed}")
+                 print(f"  Reasoning: {result.reasoning}")
+
+             return {
+                 "is_related": result.is_related,
+                 "relation_type": result.relation_type,
+                 "reusable_data": {
+                     "questions": result.reusable_data.questions,
+                     "toplines": result.reusable_data.toplines,
+                     "crosstabs": result.reusable_data.crosstabs
+                 },
+                 "time_period_changed": result.time_period_changed,
+                 "reasoning": result.reasoning
+             }
+
+         except Exception as e:
+             if self.verbose:
+                 print(f"  ⚠️ Error checking relevance: {e}")
+
+             # Return safe default (treat as new topic)
+             return {
+                 "is_related": False,
+                 "relation_type": "new_topic",
+                 "reusable_data": {
+                     "questions": False,
+                     "toplines": False,
+                     "crosstabs": False
+                 },
+                 "time_period_changed": False,
+                 "reasoning": f"Error during relevance check: {str(e)}"
+             }
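A minimal sketch of how this checker might be driven from the agent loop. The ChatOpenAI model choice is an assumption; with an empty previous_stage_results list the checker simply summarizes that no data was retrieved.

from langchain_core.messages import HumanMessage, AIMessage
from langchain_openai import ChatOpenAI
from relevance_checker import ConversationRelevanceChecker

# Hypothetical usage sketch (not part of the commit).
llm = ChatOpenAI(model="gpt-4-turbo", temperature=0)  # assumed model choice
checker = ConversationRelevanceChecker(llm, verbose=True)

history = [
    HumanMessage(content="What questions were asked in the June 2025 Unity Poll?"),
    AIMessage(content="The June 2025 Vanderbilt Unity Poll asked about..."),
]

# previous_stage_results would normally hold StageResult objects from
# survey_agent; an empty list is valid and reported as "no previous data".
verdict = checker.check_relevance(
    current_question="Break that down by party identification.",
    conversation_history=history,
    previous_stage_results=[],
)
print(verdict["relation_type"], verdict["reusable_data"])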
survey_agent.py CHANGED
The diff for this file is too large to render. See raw diff
 
toplines_rag.py CHANGED
@@ -1,51 +1,43 @@
  """
- ToplinesRAG
- -----------
- Queries the prebuilt Pinecone toplines vectorstore and synthesizes
- a natural-language answer with citations using OpenAI.
  """

  import os
- import re
-
- from pathlib import Path
  from typing import Any, Dict, List, Optional
  from dotenv import load_dotenv
- from langchain_openai import OpenAIEmbeddings, ChatOpenAI
  from langchain_pinecone import PineconeVectorStore
  from pinecone import Pinecone
- from calendar import month_name

  load_dotenv()


- def _load_prompt_file(filename: str) -> str:
-     """Load a prompt file from the prompts directory"""
-     prompt_dir = Path(__file__).parent / "prompts"
-     prompt_path = prompt_dir / filename
-     if not prompt_path.exists():
-         raise FileNotFoundError(f"Prompt file not found: {prompt_path}")
-     return prompt_path.read_text(encoding="utf-8")
-
-
  class ToplinesRAG:
      def __init__(
          self,
-         persist_directory: str = "./toplines_vectorstores",
          index_name: Optional[str] = None,
          llm_model: str = "gpt-4-turbo",
      ):
-         self.persist_directory = Path(persist_directory)
          self.index_name = index_name or os.getenv("PINECONE_INDEX_NAME_TOPLINES", "toplines-index")
          self.namespace = os.getenv("PINECONE_NAMESPACE") or None

          self.openai_api_key = os.getenv("OPENAI_API_KEY")
          if not self.openai_api_key:
              raise ValueError("OPENAI_API_KEY not set")

-         pinecone_api_key = os.getenv("PINECONE_API_KEY_TOPLINES")
          if not pinecone_api_key:
-             raise ValueError("PINECONE_API_KEY_TOPLINES not set")

          self.embeddings = OpenAIEmbeddings(
              model=os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
@@ -56,166 +48,145 @@ class ToplinesRAG:
              index=self.index, embedding=self.embeddings, namespace=self.namespace
          )

-         self.llm_model = llm_model
-         self.llm = ChatOpenAI(
-             model=self.llm_model,
-             openai_api_key=self.openai_api_key,
-             temperature=0
-         )
-
-     # ----------------------------------------------------------
-     def _build_filter(self, filters: Dict[str, Any]) -> Optional[Dict]:
          """
-         Build Pinecone filter from filters dict.
-         Only includes valid metadata fields that exist in the vectorstore.
-         Ignores unsupported fields like 'topic', 'question_ids', etc.
          """
-         if not filters:
              return None

-         # Valid filter fields that exist in toplines metadata
-         VALID_FILTER_FIELDS = {"year", "month", "poll_date", "survey_name"}

-         # Filter to only include valid fields
          valid_filters = {k: v for k, v in filters.items()
                           if k in VALID_FILTER_FIELDS and v is not None}

          if not valid_filters:
              return None

-         clauses = [{k: {"$eq": str(v)}} for k, v in valid_filters.items()]
-         return {"$and": clauses} if len(clauses) > 1 else clauses[0]
-
-     # ----------------------------------------------------------
-     def _extract_filters_from_query(self, query: str) -> Dict[str, str]:
-         filters = {}
-         year_match = re.search(r"20\d{2}", query)
-         if year_match:
-             filters["year"] = year_match.group()
-         for i in range(1, 13):
-             if month_name[i].lower() in query.lower():
-                 filters["month"] = month_name[i]
-                 break
-         return filters
-
-     # ----------------------------------------------------------
-     def _synthesize_answer(self, query: str, docs: List[Dict]) -> str:
-         """Generate a human-readable answer from the retrieved docs."""
-         if not docs:
-             # No docs retrieved β†’ truly irrelevant query
-             return (
-                 "Your query does not match any Vanderbilt Unity Poll data. "
-                 "This system only provides information from those polls."
-             )
-
-         # Format retrieved documents for context
-         context_snippets = "\n\n".join(
-             f"Survey: {d.metadata.get('survey_name', 'Vanderbilt Unity Poll')} "
-             f"({d.metadata.get('month', '')} {d.metadata.get('year', '')})\n"
-             f"Question: {d.metadata.get('variable_name', '')}\n"
-             f"Response: {d.metadata.get('response_label', '')}\n"
-             f"Pct: {d.metadata.get('pct', 'N/A')}\n"
-             f"Poll Date: {d.metadata.get('poll_date', 'N/A')}"
-             for d in docs
-         )
-
-         # Load prompt from file
-         prompt_template = _load_prompt_file("toplines_rag_prompt.txt")
-         prompt = prompt_template.format(
-             query=query,
-             context_snippets=context_snippets
-         )
-
-         completion = self.llm.invoke(prompt)
-         answer_text = completion.content.strip()
-
-         # Build sources section
-         sources = [
-             f"- {d.metadata.get('survey_name', 'Vanderbilt Unity Poll')} "
-             f"({d.metadata.get('poll_date', 'N/A')}) | Variable: {d.metadata.get('variable_name', 'N/A')}"
-             for d in docs
-         ]
-
-         return f"\n--- ANSWER ---\n\n{answer_text}\n\n--- SOURCES ---\n" + "\n".join(sources)
-
-     # ----------------------------------------------------------
-     def query_toplines(self, query: str, filters: Optional[Dict[str, Any]] = None, top_k: int = 5) -> str:
-         pinecone_filter = self._build_filter(filters or {})
-
-         # Try with filters first, but if no results, try without filters to see if data exists
-         docs = self.vector_store.similarity_search(query, k=top_k, filter=pinecone_filter)
-
-         # If no results with filters but filters were provided, try a broader search
-         if not docs and pinecone_filter:
-             # Try without filters to see if the query matches anything
-             docs_no_filter = self.vector_store.similarity_search(query, k=top_k * 2)
-             if docs_no_filter:
-                 # Filter results manually by matching metadata
-                 valid_filters = {k: str(v) for k, v in (filters or {}).items()
-                                  if k in {"year", "month", "poll_date", "survey_name"} and v}
-                 docs = [
-                     d for d in docs_no_filter
-                     if all(str(d.metadata.get(k, "")) == str(v) for k, v in valid_filters.items())
-                 ]
-                 # If still no matches after manual filtering, use the broader results
-                 if not docs:
-                     docs = docs_no_filter[:top_k]

-         return self._synthesize_answer(query, docs)

-     # ----------------------------------------------------------
-     def retrieve_raw_data(self, query: str, filters: Optional[Dict[str, Any]] = None, top_k: int = 5) -> Dict[str, Any]:
          """
-         Retrieve raw data without LLM synthesis.
-         Used by agent framework to get raw data for synthesis.

          Returns:
-             Dict with 'retrieved_docs', 'num_sources', 'filters_applied'
          """
-         pinecone_filter = self._build_filter(filters or {})

-         # Try with filters first, but if no results, try without filters to see if data exists
-         docs = self.vector_store.similarity_search(query, k=top_k, filter=pinecone_filter)

-         # If no results with filters but filters were provided, try a broader search
-         if not docs and pinecone_filter:
-             # Try without filters to see if the query matches anything
-             docs_no_filter = self.vector_store.similarity_search(query, k=top_k * 2)
-             if docs_no_filter:
-                 # Filter results manually by matching metadata
-                 valid_filters = {k: str(v) for k, v in (filters or {}).items()
-                                  if k in {"year", "month", "poll_date", "survey_name"} and v}
-                 docs = [
-                     d for d in docs_no_filter
-                     if all(str(d.metadata.get(k, "")) == str(v) for k, v in valid_filters.items())
-                 ]
-                 # If still no matches after manual filtering, use the broader results
-                 if not docs:
-                     docs = docs_no_filter[:top_k]

          return {
              "retrieved_docs": docs,
              "num_sources": len(docs),
-             "filters_applied": filters or {}
          }

-     # ----------------------------------------------------------
-     def interactive_loop(self):
-         print("ToplinesRAG ready! Type 'quit' or 'exit' to stop.\n")
-         while True:
-             query = input("Enter your poll question: ").strip()
-             if query.lower() in ("quit", "exit"):
-                 print("Exiting ToplinesRAG. Goodbye!")
-                 break
-             filters = self._extract_filters_from_query(query)
-             if filters:
-                 print(f"Using filters: {filters}")
-             print("\nRetrieving answer...\n")
-             answer = self.query_toplines(query, filters=filters)
-             print(answer)
-             print("\n" + "-"*60 + "\n")
-
-
- if __name__ == "__main__":
-     rag = ToplinesRAG()
-     rag.interactive_loop()
  """
+ Toplines RAG Module
+ -------------------
+ Retrieves topline response frequency data from Pinecone vectorstore.
+ Uses question_info for precise metadata filtering.
+ Returns raw data only - no synthesis.
+ """

  import os
  from typing import Any, Dict, List, Optional
+ from pathlib import Path
+
  from dotenv import load_dotenv
+ from langchain_openai import OpenAIEmbeddings
  from langchain_pinecone import PineconeVectorStore
  from pinecone import Pinecone

  load_dotenv()


  class ToplinesRAG:
+     """Toplines RAG with question_info-based metadata filtering."""
+
      def __init__(
          self,
          index_name: Optional[str] = None,
          llm_model: str = "gpt-4-turbo",
+         verbose: bool = False
      ):
          self.index_name = index_name or os.getenv("PINECONE_INDEX_NAME_TOPLINES", "toplines-index")
          self.namespace = os.getenv("PINECONE_NAMESPACE") or None
+         self.verbose = verbose

          self.openai_api_key = os.getenv("OPENAI_API_KEY")
          if not self.openai_api_key:
              raise ValueError("OPENAI_API_KEY not set")

+         pinecone_api_key = os.getenv("PINECONE_API_KEY")
          if not pinecone_api_key:
+             raise ValueError("PINECONE_API_KEY not set")

          self.embeddings = OpenAIEmbeddings(
              model=os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
              index=self.index, embedding=self.embeddings, namespace=self.namespace
          )

+     def _build_filter_from_question_info(self, question_info_list: List[Dict[str, Any]]) -> Optional[Dict]:
          """
+         Build Pinecone filter from question_info list.
+         Matches on variable + year + month combination (no poll_date).
          """
+         if not question_info_list:
+             return None
+
+         # Build filter conditions for each question_info
+         filter_clauses = []
+         for q_info in question_info_list:
+             conditions = []
+
+             var_name = q_info.get("variable_name")
+             if var_name:
+                 # Match on "variable" field (Pinecone stores short code like "VAND5" in "variable" field)
+                 # Also check "variable_name" as fallback
+                 var_conditions = [
+                     {"variable": {"$eq": var_name}},
+                     {"variable_name": {"$eq": var_name}}
+                 ]
+                 conditions.append({"$or": var_conditions})
+
+             year = q_info.get("year")
+             if year:
+                 # Pinecone stores year as integer
+                 conditions.append({"year": {"$eq": int(year)}})
+
+             month = q_info.get("month")
+             if month:
+                 # Pinecone stores month as string (capitalized like "March", "June")
+                 # Ensure month is capitalized to match Pinecone format
+                 month_str = str(month).capitalize()
+                 conditions.append({"month": {"$eq": month_str}})
+
+             if conditions:
+                 # Combine conditions with $and for this question
+                 if len(conditions) == 1:
+                     filter_clauses.append(conditions[0])
+                 else:
+                     filter_clauses.append({"$and": conditions})
+
+         if not filter_clauses:
              return None

+         # Combine all question filters with $or
+         if len(filter_clauses) == 1:
+             return filter_clauses[0]
+         else:
+             return {"$or": filter_clauses}
+
+     def _build_filter_from_filters(self, filters: Dict[str, Any]) -> Optional[Dict]:
+         """Build Pinecone filter from filters dict (for direct queries without question_info)"""
+         if not filters:
+             return None

+         # Only use year and month (no poll_date)
+         VALID_FILTER_FIELDS = {"year", "month", "survey_name"}
          valid_filters = {k: v for k, v in filters.items()
                           if k in VALID_FILTER_FIELDS and v is not None}

          if not valid_filters:
              return None

+         clauses = []
+         for k, v in valid_filters.items():
+             if k == "year":
+                 # Pinecone stores year as integer
+                 clauses.append({k: {"$eq": int(v)}})
+             elif k == "month":
+                 # Pinecone stores month as string (capitalized)
+                 clauses.append({k: {"$eq": str(v).capitalize()}})
+             else:
+                 # survey_name as string
+                 clauses.append({k: {"$eq": str(v)}})

+         return {"$and": clauses} if len(clauses) > 1 else clauses[0]

+     def retrieve_raw_data(
+         self,
+         query: str,
+         question_info: Optional[List[Dict[str, Any]]] = None,
+         source_questions: Optional[List[Dict[str, Any]]] = None,
+         filters: Optional[Dict[str, Any]] = None,
+         top_k: int = 10
+     ) -> Dict[str, Any]:
          """
+         Retrieve raw topline data.
+         Uses question_info for metadata filtering if provided, otherwise uses filters.
+         Falls back to semantic search if metadata filtering returns no results.
+
+         Args:
+             query: User's query (used for semantic search fallback)
+             question_info: List of question info dicts with variable_name, year, month, poll_date
+             source_questions: Optional list of full question dicts from previous stage (for reference)
+             filters: Optional filters dict (used if question_info not provided)
+             top_k: Number of results to retrieve

          Returns:
+             Dict with 'retrieved_docs', 'num_sources', 'filters_applied', 'source_questions'
          """
+         if self.verbose:
+             print(f"\nπŸ“Š [Toplines] Query: {query}")
+             if question_info:
+                 print(f"πŸ” Question info: {len(question_info)} question(s)")
+             if filters:
+                 print(f"πŸ” Filters: {filters}")

+         # Build filter from question_info (preferred) or filters
+         pinecone_filter = None
+         if question_info:
+             pinecone_filter = self._build_filter_from_question_info(question_info)
+         elif filters:
+             pinecone_filter = self._build_filter_from_filters(filters)

+         # Try metadata filtering first
+         docs = []
+         if pinecone_filter:
+             if self.verbose:
+                 print(f"πŸ”§ Using metadata filter: {pinecone_filter}")
+             docs = self.vector_store.similarity_search(query, k=top_k, filter=pinecone_filter)
+
+             if self.verbose:
+                 print(f"πŸ“₯ Retrieved {len(docs)} documents with metadata filter")
+
+         # Fallback to semantic search if no results
+         if not docs:
+             if self.verbose:
+                 print(f"⚠️ No results with metadata filter, falling back to semantic search")
+             docs = self.vector_store.similarity_search(query, k=top_k * 2)
+
+             if self.verbose:
+                 print(f"πŸ“₯ Retrieved {len(docs)} documents with semantic search")

          return {
              "retrieved_docs": docs,
              "num_sources": len(docs),
+             "filters_applied": filters or {},
+             "question_info_used": question_info or [],
+             "source_questions": source_questions or []
          }
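To make the new filter logic concrete, here is the shape of the Pinecone filter that _build_filter_from_question_info produces for two questions from the same poll. The variable codes are illustrative, borrowed from the "VAND5" example in the comment above.

# Illustration only (not part of the commit): expected filter structure.
question_info = [
    {"variable_name": "VAND5", "year": 2025, "month": "June"},
    {"variable_name": "VAND6", "year": 2025, "month": "June"},
]
# Each question becomes an $and over variable/year/month, matching the
# variable code against both the "variable" and "variable_name" fields;
# the per-question clauses are then joined with $or:
expected_filter = {"$or": [
    {"$and": [
        {"$or": [{"variable": {"$eq": "VAND5"}}, {"variable_name": {"$eq": "VAND5"}}]},
        {"year": {"$eq": 2025}},
        {"month": {"$eq": "June"}},
    ]},
    {"$and": [
        {"$or": [{"variable": {"$eq": "VAND6"}}, {"variable_name": {"$eq": "VAND6"}}]},
        {"year": {"$eq": 2025}},
        {"month": {"$eq": "June"}},
    ]},
]}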
toplines_vectorstores/poll_catalog_toplines.json CHANGED
@@ -1,10 +1,50 @@
  {
    "2025-February": {
      "file": "toplines_data/Vanderbilt_Unity_Poll_2025_February_toplines.json",
      "poll_date": "2025-February",
-     "num_toplines": 41,
      "survey_name": "Vanderbilt Unity Poll",
-     "year": "2025",
      "month": "February"
    },
    "2025-June": {
@@ -12,7 +52,7 @@
      "poll_date": "2025-June",
      "num_toplines": 167,
      "survey_name": "Vanderbilt Unity Poll",
-     "year": "2025",
      "month": "June"
    }
  }

  {
+   "2023-June": {
+     "file": "toplines_data/Vanderbilt_Unity_Poll_2023_June_toplines.json",
+     "poll_date": "2023-June",
+     "num_toplines": 82,
+     "survey_name": "Vanderbilt Unity Poll",
+     "year": 2023,
+     "month": "June"
+   },
+   "2023-March": {
+     "file": "toplines_data/Vanderbilt_Unity_Poll_2023_March_toplines.json",
+     "poll_date": "2023-March",
+     "num_toplines": 40,
+     "survey_name": "Vanderbilt Unity Poll",
+     "year": 2023,
+     "month": "March"
+   },
+   "2024-March": {
+     "file": "toplines_data/Vanderbilt_Unity_Poll_2024_March_toplines.json",
+     "poll_date": "2024-March",
+     "num_toplines": 58,
+     "survey_name": "Vanderbilt Unity Poll",
+     "year": 2024,
+     "month": "March"
+   },
+   "2024-October": {
+     "file": "toplines_data/Vanderbilt_Unity_Poll_2024_October_toplines.json",
+     "poll_date": "2024-October",
+     "num_toplines": 69,
+     "survey_name": "Vanderbilt Unity Poll",
+     "year": 2024,
+     "month": "October"
+   },
+   "2024-September": {
+     "file": "toplines_data/Vanderbilt_Unity_Poll_2024_September_toplines.json",
+     "poll_date": "2024-September",
+     "num_toplines": 80,
+     "survey_name": "Vanderbilt Unity Poll",
+     "year": 2024,
+     "month": "September"
+   },
    "2025-February": {
      "file": "toplines_data/Vanderbilt_Unity_Poll_2025_February_toplines.json",
      "poll_date": "2025-February",
+     "num_toplines": 95,
      "survey_name": "Vanderbilt Unity Poll",
+     "year": 2025,
      "month": "February"
    },
    "2025-June": {
      "poll_date": "2025-June",
      "num_toplines": 167,
      "survey_name": "Vanderbilt Unity Poll",
+     "year": 2025,
      "month": "June"
    }
  }
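The substantive schema change in this catalog is that "year" is now a JSON number rather than a string, which lines up with the integer $eq comparisons in the RAG filters above. A quick sanity check one might run against this file:

# Hypothetical sanity check (not part of the commit).
import json

with open("toplines_vectorstores/poll_catalog_toplines.json") as f:
    catalog = json.load(f)

for poll_date, info in sorted(catalog.items()):
    # year must be an int so Pinecone's {"year": {"$eq": int(...)}} filters match
    assert isinstance(info["year"], int), f"{poll_date}: year should be an int"
    print(poll_date, info["num_toplines"], "toplines")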
toplines_vectorstores/toplines_index.json CHANGED
The diff for this file is too large to render. See raw diff