umangchaudhry commited on
Commit
68610da
·
verified ·
1 Parent(s): 17c6067

Upload 20 files

Browse files
crosstab_rag.py ADDED
@@ -0,0 +1,696 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ crosstab_rag.py
4
+
5
+ Full Crosstab RAG pipeline:
6
+ - Parse user query for survey/year/month/topic
7
+ - Use QuestionnaireRAG to find matching questions (reuses existing vectorstore)
8
+ - Extract variable names from matched questions
9
+ - Query Pinecone within the appropriate namespace (survey crosstabs namespace)
10
+ - Collect all parts for the matched question(s)
11
+ - Summarize with the LLM, cite source filenames/part ids
12
+ """
13
+
14
+ import os
15
+ import re
16
+ import argparse
17
+ from typing import List, Dict, Optional, Any
18
+ from pathlib import Path
19
+
20
+ from dotenv import load_dotenv
21
+
22
+ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
23
+ from langchain.schema import Document
24
+ from langchain_pinecone import PineconeVectorStore
25
+ from pinecone import Pinecone
26
+
27
+ # Import QuestionnaireRAG to reuse existing question matching
28
+ from questionnaire_rag import QuestionnaireRAG
29
+
30
+ load_dotenv()
31
+
32
+
33
+ def _load_prompt_file(filename: str) -> str:
34
+ """Load a prompt file from the prompts directory"""
35
+ prompt_dir = Path(__file__).parent / "prompts"
36
+ prompt_path = prompt_dir / filename
37
+ if not prompt_path.exists():
38
+ raise FileNotFoundError(f"Prompt file not found: {prompt_path}")
39
+ return prompt_path.read_text(encoding="utf-8")
40
+
41
+ # -------------------------
42
+ # Config / Environment
43
+ # -------------------------
44
# Credentials and index name are read from the environment once, at import
# time. The crosstab index uses its own Pinecone key variable
# (PINECONE_API_KEY_CROSSTABS).
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY_CROSSTABS")
PINECONE_INDEX_NAME = os.getenv("PINECONE_INDEX_NAME_CROSSTABS", "crosstab-index")

# Fail fast: importing this module raises ValueError when either key is
# missing, so callers never get a half-configured retriever.
if not OPENAI_API_KEY:
    raise ValueError("OPENAI_API_KEY environment variable not set")
if not PINECONE_API_KEY:
    raise ValueError("PINECONE_API_KEY_CROSSTABS environment variable not set")

# Model names are overridable through the environment.
EMBED_MODEL = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
LLM_MODEL = os.getenv("OPENAI_LLM_MODEL", "gpt-4o")

# Default top_k sent to Pinecone when fetching chunks for one variable.
PINECONE_RETRIEVE_K = 100
# Upper bound on the number of chunks returned per variable after sorting.
MAX_CROSSTAB_CHUNKS = 50
58
+
59
+ # -------------------------
60
+ # Utilities
61
+ # -------------------------
62
# Month names in calendar order; the first one (in this order) that occurs in
# the query as a whole word wins.
_MONTH_NAMES = (
    "january", "february", "march", "april", "may", "june",
    "july", "august", "september", "october", "november", "december",
)
_MONTH_WORD_RE = re.compile(r"\b(" + "|".join(_MONTH_NAMES) + r")\b")
_YEAR_RE = re.compile(r"\b(20\d{2})\b")
# Whole-word recency cues. Plain substring tests used to fire on "know",
# "snow", "renowned", ... because they all contain "now".
_RECENCY_RE = re.compile(r"\b(recent(?:ly)?|latest|current(?:ly)?|now)\b")


def extract_year_month_poll(
    query: str,
    *,
    latest_month: str = "June",
    latest_year: str = "2025",
) -> Dict[str, Optional[str]]:
    """Pull survey year, month and poll name hints out of a free-text query.

    Args:
        query: The user's question.
        latest_month: Month to assume when the query only says "latest",
            "recent", "current" or "now" and names no month.
        latest_year: Year to assume together with ``latest_month`` when the
            query names no year either.

    Returns:
        A dict with keys ``"year"`` (four-digit string), ``"month"``
        (capitalized month name) and ``"poll"`` (``"Vanderbilt_Unity_Poll"``);
        each value is ``None`` when it could not be determined.
    """
    out: Dict[str, Optional[str]] = {"year": None, "month": None, "poll": None}
    q = query.lower()

    year_match = _YEAR_RE.search(q)
    if year_match:
        out["year"] = year_match.group(1)

    # Match month names as whole words so "maybe" or "marching" are not read
    # as May / March. NOTE: the modal verb "may" is still ambiguous.
    mentioned = set(_MONTH_WORD_RE.findall(q))
    for name in _MONTH_NAMES:
        if name in mentioned:
            out["month"] = name.capitalize()
            break

    # No explicit month: fall back to the latest wave only on a real recency
    # cue, never on an accidental substring.
    if not out["month"] and _RECENCY_RE.search(q):
        out["month"] = latest_month
        if not out["year"]:
            out["year"] = latest_year

    # NOTE(review): intentionally still a substring test. "community" also
    # contains "unity", but only one poll is supported, so the loose match
    # cannot select the wrong survey.
    if "vanderbilt" in q or "unity" in q:
        out["poll"] = "Vanderbilt_Unity_Poll"
    return out
82
+
83
+
84
+ # -------------------------
85
+ # Pinecone retrieval + assembly
86
+ # -------------------------
87
class CrosstabRetriever:
    """Fetch crosstab chunks for one survey variable from a Pinecone index.

    Retrieval is metadata-driven: chunks are selected by their
    ``variable_name`` metadata field rather than by semantic similarity.
    """

    def __init__(self,
                 pinecone_api_key: str = PINECONE_API_KEY,
                 index_name: str = PINECONE_INDEX_NAME,
                 embed_model: str = EMBED_MODEL,
                 openai_api_key: str = OPENAI_API_KEY,
                 verbose: bool = False):
        """Create the Pinecone client and the embedder.

        Args:
            pinecone_api_key: API key for the crosstab Pinecone project.
            index_name: Name of the Pinecone index holding crosstab chunks.
            embed_model: OpenAI embedding model name (used to size the
                placeholder query vector).
            openai_api_key: OpenAI API key for the embedder.
            verbose: When True, print progress and diagnostics.
        """
        self.pc = Pinecone(api_key=pinecone_api_key)
        self.index_name = index_name
        self.embedder = OpenAIEmbeddings(model=embed_model, openai_api_key=openai_api_key)
        self.verbose = verbose

    def _make_vectorstore(self, namespace: str) -> PineconeVectorStore:
        """Return a LangChain vector store bound to ``namespace``."""
        index = self.pc.Index(self.index_name)
        return PineconeVectorStore(index=index, embedding=self.embedder, namespace=namespace)

    def _log(self, message: str) -> None:
        """Print ``message`` only when verbose mode is on."""
        if self.verbose:
            print(message)

    def _embedding_dimension(self) -> int:
        """Guess the index dimension from the embedder configuration.

        An explicit ``dimensions`` setting on the embedder wins; otherwise
        "large" models map to 3072 and everything else to 1536.
        """
        explicit = getattr(self.embedder, "dimensions", None)
        if isinstance(explicit, int) and explicit > 0:
            return explicit
        model_name = str(getattr(self.embedder, "model", "")).lower()
        if "small" not in model_name and "large" in model_name:
            return 3072
        return 1536

    @staticmethod
    def _take_content(metadata: Dict[str, Any]) -> str:
        """Remove and return the chunk text stored in ``metadata``.

        The text may be stored under ``text`` or ``page_content`` depending
        on how the chunk was uploaded; returns "" if neither holds text.
        """
        return metadata.pop("text", None) or metadata.pop("page_content", None) or ""

    def retrieve_parts_for_variable(self, namespace: str, variable_prefix: str, user_query: str = None, k: int = PINECONE_RETRIEVE_K) -> List[Document]:
        """
        Retrieve crosstab chunks for a specific variable using direct metadata filtering.

        Since we already know the exact variable name from QuestionnaireRAG, we use
        Pinecone metadata filtering instead of semantic search for better accuracy and speed.

        Args:
            namespace: Pinecone namespace (e.g., "Vanderbilt_Unity_Poll_2025_February_cleaned_data_crosstabs")
            variable_prefix: Exact variable name (e.g., "VAND15")
            user_query: Not used anymore, kept for backward compatibility
            k: Maximum number of chunks to request from Pinecone

        Returns:
            List of Document objects with crosstab data for the variable, sorted by
            ``chunk_index`` and capped at MAX_CROSSTAB_CHUNKS. Empty when the index
            cannot be reached, the namespace does not exist, or nothing matches.
        """
        try:
            index = self.pc.Index(self.index_name)
            stats = index.describe_index_stats()
            namespaces = stats.get('namespaces', {}) or {}
        except Exception as exc:
            # Best-effort contract: callers treat [] as "no data".
            self._log(f" ❌ Could not inspect index '{self.index_name}': {exc}")
            return []
        if namespace not in namespaces:
            self._log(f" ⚠️ Namespace '{namespace}' not found in index '{self.index_name}'")
            return []

        # The uploaded variable_name is the CSV stem ("VAND15_crosstab"), while
        # QuestionnaireRAG returns the bare name ("VAND15"); match both forms.
        base_variable = variable_prefix.replace("_crosstab", "").split("_")[0]
        if not base_variable:
            self._log(" ⚠️ Empty variable name, nothing to retrieve")
            return []
        variable_with_suffix = f"{base_variable}_crosstab"

        self._log(f" 🔍 Looking for variable: '{base_variable}' or '{variable_with_suffix}' in namespace: '{namespace}'")

        filter_dict = {
            "$or": [
                {"variable_name": {"$eq": base_variable}},
                {"variable_name": {"$eq": variable_with_suffix}}
            ]
        }
        self._log(f" 🔧 Filter: {filter_dict}")

        # Pinecone requires a query vector; with an exact metadata filter the
        # ranking is irrelevant, so an all-zero placeholder is used.
        dummy_vector = [0.0] * self._embedding_dimension()

        try:
            result = index.query(
                vector=dummy_vector,
                top_k=k,
                namespace=namespace,
                filter=filter_dict,
                include_metadata=True
            )
            self._log(f" 📊 Pinecone query returned {len(result.matches)} matches")

            docs = []
            for match in result.matches:
                # Copy so popping the text does not mutate the response object.
                metadata = dict(match.metadata or {})
                if self.verbose:
                    found_var = metadata.get("variable_name", "N/A")
                    found_qid = metadata.get("question_id", "N/A")
                    print(f" 📄 Found: variable_name='{found_var}', question_id='{found_qid}'")

                content = self._take_content(metadata)
                if not content:
                    self._log(f" ⚠️ No content found for match, skipping")
                    continue
                docs.append(Document(page_content=content, metadata=metadata))

            self._log(f" ✅ Successfully loaded {len(docs)} document(s)")

            # Sort by chunk_index to maintain order
            docs.sort(key=lambda d: d.metadata.get("chunk_index", 999))
            return docs[:MAX_CROSSTAB_CHUNKS]
        except Exception as e:
            self._log(f" ❌ Error with metadata filter: {e}")
            self._log(f" 🔄 Falling back to manual filtering...")

        return self._manual_fallback(
            index, namespace, base_variable, variable_with_suffix, dummy_vector, k
        )

    def _manual_fallback(self, index, namespace: str, base_variable: str,
                         variable_with_suffix: str, dummy_vector: List[float],
                         k: int) -> List[Document]:
        """Fetch unfiltered matches and select the variable's chunks client-side.

        Used only when the metadata-filtered query raised. Returns [] if the
        fallback query fails as well.
        """
        try:
            if self.verbose:
                # Diagnostic only: show which variable names the namespace holds.
                sample_result = index.query(
                    vector=dummy_vector,
                    top_k=10,
                    namespace=namespace,
                    include_metadata=True
                )
                if sample_result.matches:
                    print(f" 📋 Sample variables in namespace:")
                    for sample in sample_result.matches[:5]:
                        sample_meta = sample.metadata or {}
                        sample_var = sample_meta.get("variable_name", "N/A")
                        sample_qid = sample_meta.get("question_id", "N/A")
                        print(f" - variable_name: '{sample_var}', question_id: '{sample_qid}'")

            result = index.query(
                vector=dummy_vector,
                top_k=k * 2,  # Get more to filter from
                namespace=namespace,
                include_metadata=True
            )

            base_lower = base_variable.lower()
            wanted_names = {base_lower, variable_with_suffix.lower()}
            # Require the "_" delimiter so "VAND1" does not match "VAND15_...".
            id_prefix = base_lower + "_"

            docs = []
            for match in result.matches:
                metadata = dict(match.metadata or {})
                var_name = str(metadata.get("variable_name", "") or "")
                question_id = str(metadata.get("question_id", "") or "").lower()

                is_match = (
                    var_name.lower() in wanted_names
                    or question_id == base_lower
                    or question_id.startswith(id_prefix)
                )
                if not is_match:
                    continue

                content = self._take_content(metadata)
                if content:
                    docs.append(Document(page_content=content, metadata=metadata))
                else:
                    self._log(f" ⚠️ Matched variable '{var_name}' but no content found")

            docs.sort(key=lambda d: d.metadata.get("chunk_index", 999))
            self._log(f" ✅ Fallback found {len(docs)} document(s)")
            return docs[:MAX_CROSSTAB_CHUNKS]
        except Exception as fallback_error:
            self._log(f" ❌ Fallback also failed: {fallback_error}")
            return []
277
+
278
+ # -------------------------
279
+ # LLM summarizer
280
+ # -------------------------
281
class CrosstabSummarizer:
    """Turn retrieved crosstab chunks into an LLM-written answer."""

    def __init__(self, llm_model: str = LLM_MODEL, openai_api_key: str = OPENAI_API_KEY):
        """Build the chat model (temperature 0.0) used for summarization."""
        self.llm = ChatOpenAI(model=llm_model, openai_api_key=openai_api_key, temperature=0.0)

    def summarize(self, user_query: str, retrieved_docs: List[Document], question_text: Optional[str] = None, top_n_sources: int = 6) -> Dict:
        """Summarize crosstab chunks with respect to the user's question.

        Args:
            user_query: The question the user asked.
            retrieved_docs: Crosstab chunks to summarize.
            question_text: Text of the matched survey question; when given,
                the prompt also asks the model to check its relevance first.
            top_n_sources: Maximum number of source ids to return.

        Returns:
            Dict with ``"answer"`` (stripped model output, or an error
            message if the model call raised) and ``"sources"`` (the first
            ``top_n_sources`` chunk identifiers).
        """
        if not retrieved_docs:
            return {"answer": "No relevant crosstab data found for that query.", "sources": []}

        sections = []
        labels = []
        for position, doc in enumerate(retrieved_docs, start=1):
            meta = doc.metadata or {}
            # Prefer the chunk's own id, then its variable, then a positional label.
            label = meta.get("question_id") or meta.get("variable_name") or f"part_{position}"
            body = doc.page_content or ""
            sections.append(f"--- Part {position} | {label} ---\n{body}")
            labels.append(label)
        context_text = "\n\n".join(sections)

        # Prompt templates live on disk and are re-read on every call.
        system_prompt = _load_prompt_file("crosstab_rag_prompt_system.txt")

        if question_text:
            question_context = f"\n\nSURVEY QUESTION THAT WAS RETRIEVED: {question_text}"
            relevance_check = (
                "\n\n⚠️ FIRST: Check if the retrieved question above is actually relevant to the user's question. "
                "If it's about a different topic (e.g., user asked about 'economy' but question is about 'unity' or 'politics'), "
                "you MUST state this clearly and NOT provide detailed analysis of irrelevant data."
            )
        else:
            question_context = ""
            relevance_check = ""

        template = _load_prompt_file("crosstab_rag_prompt_user.txt")
        user_prompt = template.format(
            user_query=user_query,
            question_context=question_context,
            relevance_check=relevance_check,
            context_text=context_text
        )

        from langchain.schema import HumanMessage, SystemMessage
        conversation = [
            SystemMessage(content=system_prompt),
            HumanMessage(content=user_prompt),
        ]
        try:
            reply = self.llm.invoke(conversation)
            if hasattr(reply, 'content'):
                answer = reply.content
            else:
                answer = str(reply)
        except Exception as e:
            # Best effort: surface the failure as the answer text.
            answer = f"Error generating summary: {e}"
        return {"answer": answer.strip(), "sources": labels[:top_n_sources]}
322
+
323
+ # -------------------------
324
+ # Orchestration - full pipeline
325
+ # -------------------------
326
class CrosstabsRAG:
    """End-to-end crosstab pipeline.

    Matches survey questions through QuestionnaireRAG, fetches the crosstab
    chunks of each matched variable, and either summarizes them (``query``)
    or returns them untouched (``retrieve_raw_data``).
    """

    def __init__(self, questionnaire_rag: QuestionnaireRAG, verbose: bool = False):
        """
        Initialize CrosstabsRAG.

        Args:
            questionnaire_rag: Initialized QuestionnaireRAG instance to reuse for question matching
            verbose: Whether to print detailed logging
        """
        self.questionnaire_rag = questionnaire_rag
        self.verbose = verbose
        self.retriever = CrosstabRetriever(verbose=verbose)
        self.summarizer = CrosstabSummarizer()

    def _log(self, message: str) -> None:
        """Print ``message`` only when verbose mode is on."""
        if self.verbose:
            print(message)

    def _plan_retrieval(self, user_query: str, filters: Optional[Dict[str, Any]],
                        lookup_name: str, label: str = "") -> Dict[str, Any]:
        """Resolve the survey wave and match questionnaire questions.

        Shared first stage of ``query`` and ``retrieve_raw_data``.

        Args:
            user_query: The question to answer.
            filters: Optional filters (topic, year, month, survey_name).
            lookup_name: Name of the QuestionnaireRAG method to call.
            label: Suffix appended to verbose step messages.

        Returns:
            ``{"error": ...}`` on failure, otherwise a dict with ``poll``,
            ``year``, ``month``, ``namespace`` and ``source_questions``.
        """
        hints = extract_year_month_poll(user_query)
        year, month, poll = hints.get("year"), hints.get("month"), hints.get("poll")

        # Fill gaps from the filters; ignore keys that are present but empty
        # so a None year is reported as missing rather than crashing int().
        supplied = filters or {}
        if not year and supplied.get("year"):
            year = str(supplied["year"])
        if not month and supplied.get("month"):
            month = supplied["month"]
        if not poll and "survey_name" in supplied:
            poll = "Vanderbilt_Unity_Poll"  # Default mapping

        if not all([poll, year, month]):
            missing = []
            if not poll:
                missing.append("poll/survey name")
            if not year:
                missing.append("year")
            if not month:
                missing.append("month")
            return {"error": f"Could not determine {', '.join(missing)} from query. Please specify in your question."}

        try:
            year_number = int(year)
        except (TypeError, ValueError):
            return {"error": f"Invalid year '{year}'. Please specify a four-digit year."}

        q_filters = {
            "year": year_number,
            "month": month,
            "survey_name": "Vanderbilt Unity Poll"  # Map from poll variable if needed
        }

        topic = supplied.get("topic")
        if supplied:
            self._log(f" 📥 Received filters: {filters}")
            if topic:
                q_filters["topic"] = topic
                self._log(f" 📌 Added topic filter: {topic}")
            elif "topic" not in supplied:
                self._log(f" ⚠️ No 'topic' key in filters dict")
            else:
                self._log(f" ⚠️ Topic filter is empty/None: {topic}")
        else:
            self._log(f" ⚠️ No filters dict provided to CrosstabsRAG.query()")

        # Put the topic up front in the query text, but only when there is a
        # real topic (a None topic used to crash on .lower()).
        enhanced_query = user_query
        if topic:
            topic_text = str(topic)
            if topic_text.lower() not in user_query.lower():
                enhanced_query = f"{topic_text} {user_query}"

        self._log(f"🔍 [CrosstabRAG] Step 1: Querying QuestionnaireRAG vectorstore{label}")
        self._log(f" Query: {enhanced_query}")
        self._log(f" Filters being passed: {q_filters}")

        try:
            lookup = getattr(self.questionnaire_rag, lookup_name)
            q_result = lookup(
                question=enhanced_query,
                filters=q_filters,
                k=10  # Get more matches to capture all relevant questions
            )
        except Exception as e:
            return {"error": f"Error querying questionnaire: {e}"}

        source_questions = (q_result or {}).get("source_questions", [])
        if not source_questions:
            return {"error": "No matching questions found in questionnaire for that query."}

        if self.verbose:
            print(f"✅ [CrosstabRAG] Step 1 Complete: QuestionnaireRAG matched {len(source_questions)} question(s)")
            for i, q in enumerate(source_questions[:3], 1):
                var = q.get("variable_name", "unknown")
                qtext = (q.get("question_text") or "")[:80]
                print(f" {i}. {var}: {qtext}...")

        namespace = f"{poll}_{year}_{month}_cleaned_data_crosstabs".replace(" ", "_")
        return {
            "poll": poll,
            "year": year,
            "month": month,
            "namespace": namespace,
            "source_questions": source_questions,
        }

    def _iter_crosstabs(self, plan: Dict[str, Any], user_query: str, label: str = ""):
        """Yield ``(matched_question, crosstab_docs)`` for questions with data.

        Questions without a ``variable_name`` or without any crosstab chunks
        are skipped.
        """
        namespace = plan["namespace"]
        for matched_question in plan["source_questions"]:
            variable_name = matched_question.get("variable_name")
            if not variable_name:
                self._log(" ⚠️ Skipping matched question without a variable_name")
                continue

            self._log(f"\n🔍 [CrosstabRAG] Step 2: Processing {variable_name}{label}")
            self._log(f" Namespace: {namespace}")
            self._log(f" Variable: {variable_name}")

            crosstab_docs = self.retriever.retrieve_parts_for_variable(
                namespace=namespace,
                variable_prefix=variable_name,
                user_query=user_query,
                k=PINECONE_RETRIEVE_K
            )
            if not crosstab_docs:
                self._log(f" ⚠️ No crosstab data found for {variable_name}")
                continue

            self._log(f" ✅ Retrieved {len(crosstab_docs)} crosstab chunk(s)")
            yield matched_question, crosstab_docs

    def query(self, user_query: str, filters: Optional[Dict[str, Any]] = None) -> Dict:
        """
        Query the crosstab system. Extracts poll, year, and month from the query.
        Uses QuestionnaireRAG to find matching questions, then retrieves crosstab data.

        Args:
            user_query: The question to answer
            filters: Optional filters dict (may include topic, year, month, survey_name)

        Returns:
            Dict with answer, sources, and metadata, or ``{"error": ...}``
        """
        plan = self._plan_retrieval(user_query, filters, "query_with_metadata")
        if "error" in plan:
            return plan

        poll, year, month = plan["poll"], plan["year"], plan["month"]
        namespace = plan["namespace"]
        source_questions = plan["source_questions"]

        all_question_answers = []
        all_sources = []
        matched_variables = []

        for matched_question, crosstab_docs in self._iter_crosstabs(plan, user_query):
            variable_name = matched_question["variable_name"]
            question_text = matched_question.get("question_text", "")

            if self.verbose:
                chunk_ids = [
                    str(d.metadata.get("question_id", d.metadata.get("variable_name", "unknown")))
                    for d in crosstab_docs[:3]
                ]
                print(f" Chunk IDs: {', '.join(chunk_ids)}{' ...' if len(crosstab_docs) > 3 else ''}")

            summary = self.summarizer.summarize(
                user_query=user_query,
                retrieved_docs=crosstab_docs,
                question_text=question_text,
                top_n_sources=6
            )

            # Prefix each answer with the question it belongs to.
            question_header = f"\n\n--- Question: {variable_name} ---\n{question_text}\n"
            all_question_answers.append(question_header + summary["answer"].strip())
            all_sources.extend(summary["sources"])
            matched_variables.append(variable_name)

        if not all_question_answers:
            return {"error": f"No crosstab data found for any of the {len(source_questions)} matched questions in namespace '{namespace}'."}

        self._log(f"\n🔍 [CrosstabRAG] Step 3: Combining {len(all_question_answers)} question(s)")

        combined_answer = "\n\n".join(all_question_answers)
        citation_block = (
            f"\n\n---\nSource: {poll.replace('_', ' ')}, {month} {year}\n"
            f"Questions analyzed: {', '.join(matched_variables)}\n"
            f"Total questions: {len(matched_variables)}\n"
        )
        combined_answer = combined_answer + citation_block

        return {
            "answer": combined_answer,
            # Deduplicate while keeping first-seen order (stable output).
            "sources": list(dict.fromkeys(all_sources)),
            "matched_variable": matched_variables[0] if len(matched_variables) == 1 else f"{len(matched_variables)} questions",
            "matched_variables": matched_variables,
            "matched_question": source_questions[0].get("question_text", "") if source_questions else "",
            "namespace_used": namespace,
            "survey_info": {"poll": poll, "year": year, "month": month}
        }

    def retrieve_raw_data(self, user_query: str, filters: Optional[Dict[str, Any]] = None) -> Dict:
        """
        Retrieve raw data without LLM summarization.
        Used by agent framework to get raw data for synthesis.

        Args:
            user_query: The question to answer
            filters: Optional filters dict (may include topic, year, month, survey_name)

        Returns:
            Dict with crosstab_docs_by_variable, matched_questions, matched_variables,
            namespace_used, survey_info, or ``{"error": ...}``
        """
        label = " (raw data)"
        plan = self._plan_retrieval(user_query, filters, "retrieve_raw_data", label)
        if "error" in plan:
            return plan

        namespace = plan["namespace"]
        source_questions = plan["source_questions"]

        crosstab_docs_by_variable = {}
        matched_variables = []

        for matched_question, crosstab_docs in self._iter_crosstabs(plan, user_query, label):
            variable_name = matched_question["variable_name"]
            # Store raw documents without summarization
            crosstab_docs_by_variable[variable_name] = {
                "crosstab_docs": crosstab_docs,
                "question_text": matched_question.get("question_text", ""),
                "matched_question": matched_question
            }
            matched_variables.append(variable_name)

        if not crosstab_docs_by_variable:
            return {"error": f"No crosstab data found for any of the {len(source_questions)} matched questions in namespace '{namespace}'."}

        self._log(f"\n✅ [CrosstabRAG] Step 2 Complete: Retrieved raw data for {len(matched_variables)} question(s)")

        return {
            "crosstab_docs_by_variable": crosstab_docs_by_variable,
            "matched_questions": source_questions,
            "matched_variables": matched_variables,
            "namespace_used": namespace,
            "survey_info": {"poll": plan["poll"], "year": plan["year"], "month": plan["month"]}
        }
641
+
642
+ # -------------------------
643
+ # CLI / Interactive
644
+ # -------------------------
645
def _print_crosstab_result(out: dict) -> None:
    """Print one ``CrosstabsRAG.query`` result (error or answer) to stdout."""
    if "error" in out:
        print(f"Error: {out['error']}")
        return
    matched_question = out.get("matched_question", "")
    if matched_question:
        print(f"\nSURVEY QUESTION:\n{matched_question}\n")
    print("ANSWER:\n", out["answer"])


def main():
    """Command-line entry point.

    With ``--query`` answers a single question; otherwise starts an
    interactive loop that ends on an empty line, "quit"/"exit", Ctrl-C, or
    end of input.
    """
    parser = argparse.ArgumentParser(description="Crosstab RAG CLI - query survey crosstabs.")
    parser.add_argument("--query", "-q", help="Question to ask (if omitted, interactive).", default=None)
    args = parser.parse_args()

    # QuestionnaireRAG is created with the general PINECONE_API_KEY, which
    # is a different variable from the crosstab-specific key read at import.
    openai_api_key = os.getenv("OPENAI_API_KEY")
    pinecone_api_key = os.getenv("PINECONE_API_KEY")

    if not openai_api_key or not pinecone_api_key:
        print("Error: Missing API keys")
        print("Set OPENAI_API_KEY and PINECONE_API_KEY environment variables")
        return

    questionnaire_rag = QuestionnaireRAG(
        openai_api_key=openai_api_key,
        pinecone_api_key=pinecone_api_key,
        persist_directory="./questionnaire_vectorstores",
        verbose=False
    )

    system = CrosstabsRAG(questionnaire_rag=questionnaire_rag)

    if args.query:
        _print_crosstab_result(system.query(args.query))
        return

    print("Interactive Crosstab RAG\nType 'quit' to stop.")
    while True:
        try:
            q = input("\nYour question: ").strip()
            if not q or q.lower() in ("quit", "exit"):
                break
            _print_crosstab_result(system.query(q))
        except (KeyboardInterrupt, EOFError):
            # EOFError: stdin closed (Ctrl-D or piped input exhausted).
            break


if __name__ == "__main__":
    main()
crosstab_vectorstores/crosstab_catalog.json ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Vanderbilt_Unity_Poll_2023_June_cleaned_data_crosstabs": {
3
+ "num_questions": 18,
4
+ "num_chunks": 61,
5
+ "path": "crosstabs/Vanderbilt_Unity_Poll_2023_June_cleaned_data_crosstabs",
6
+ "has_questionnaire": true
7
+ },
8
+ "Vanderbilt_Unity_Poll_2023_March_cleaned_data_crosstabs": {
9
+ "num_questions": 9,
10
+ "num_chunks": 33,
11
+ "path": "crosstabs/Vanderbilt_Unity_Poll_2023_March_cleaned_data_crosstabs",
12
+ "has_questionnaire": true
13
+ },
14
+ "Vanderbilt_Unity_Poll_2023_September_cleaned_data_crosstabs": {
15
+ "num_questions": 15,
16
+ "num_chunks": 54,
17
+ "path": "crosstabs/Vanderbilt_Unity_Poll_2023_September_cleaned_data_crosstabs",
18
+ "has_questionnaire": true
19
+ },
20
+ "Vanderbilt_Unity_Poll_2024_March_cleaned_data_crosstabs": {
21
+ "num_questions": 26,
22
+ "num_chunks": 86,
23
+ "path": "crosstabs/Vanderbilt_Unity_Poll_2024_March_cleaned_data_crosstabs",
24
+ "has_questionnaire": true
25
+ },
26
+ "Vanderbilt_Unity_Poll_2024_October_cleaned_data_crosstabs": {
27
+ "num_questions": 20,
28
+ "num_chunks": 70,
29
+ "path": "crosstabs/Vanderbilt_Unity_Poll_2024_October_cleaned_data_crosstabs",
30
+ "has_questionnaire": true
31
+ },
32
+ "Vanderbilt_Unity_Poll_2024_September_cleaned_data_crosstabs": {
33
+ "num_questions": 19,
34
+ "num_chunks": 61,
35
+ "path": "crosstabs/Vanderbilt_Unity_Poll_2024_September_cleaned_data_crosstabs",
36
+ "has_questionnaire": true
37
+ },
38
+ "Vanderbilt_Unity_Poll_2025_February_cleaned_data_crosstabs": {
39
+ "num_questions": 29,
40
+ "num_chunks": 122,
41
+ "path": "crosstabs/Vanderbilt_Unity_Poll_2025_February_cleaned_data_crosstabs",
42
+ "has_questionnaire": true
43
+ },
44
+ "Vanderbilt_Unity_Poll_2025_June_cleaned_data_crosstabs": {
45
+ "num_questions": 30,
46
+ "num_chunks": 106,
47
+ "path": "crosstabs/Vanderbilt_Unity_Poll_2025_June_cleaned_data_crosstabs",
48
+ "has_questionnaire": true
49
+ }
50
+ }
prompts/README.md ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Prompts Directory
2
+
3
+ This directory contains all the prompts used by the survey agent and RAG pipelines. Prompts are organized by component and purpose for easy maintenance and updates.
4
+
5
+ ## Survey Agent Prompts
6
+
7
+ ### `research_brief_prompt.txt`
8
+ **Purpose**: Used by the research brief generator to plan multi-stage research queries.
9
+
10
+ **Used in**: `survey_agent.py` → `_generate_research_brief()`
11
+
12
+ **Dynamic Variables**:
13
+ - `{available_pipelines}` - Status of available data pipelines
14
+ - `{available_surveys}` - List of available survey names
15
+ - `{available_months}` - List of available months by year
16
+ - `{verification_context}` - Context from previous verification failures (if retrying)
17
+
18
+ ### `verification_prompt_system.txt`
19
+ **Purpose**: System message for the verification step that checks if retrieved data matches the user's question.
20
+
21
+ **Used in**: `survey_agent.py` → `_verify_results()`
22
+
23
+ ### `verification_prompt_user.txt`
24
+ **Purpose**: User message template for verification step with actual data details.
25
+
26
+ **Used in**: `survey_agent.py` → `_verify_results()`
27
+
28
+ **Dynamic Variables**:
29
+ - `{question}` - The user's question
+ - `{filters_applied}` - Filters applied during retrieval
30
+ - `{retrieval_summary}` - Summary of retrieved data
31
+ - `{raw_data_details}` - Detailed raw data structure
32
+
33
+ ### `synthesis_prompt_system.txt`
34
+ **Purpose**: System message for the final synthesis step that combines all research results.
35
+
36
+ **Used in**: `survey_agent.py` → `_synthesize_response()`
37
+
38
+ ### `synthesis_prompt_user.txt`
39
+ **Purpose**: User message template for synthesis with all retrieved data.
40
+
41
+ **Used in**: `survey_agent.py` → `_synthesize_response()`
42
+
43
+ **Dynamic Variables**:
44
+ - `{stage_count}` - "multiple stages" or "the research"
45
+ - `{full_question}` - The user's question
46
+ - `{reasoning}` - Research plan reasoning
47
+ - `{context_parts}` - All retrieved raw data formatted
48
+ - `{unavailable_note}` - Note about unavailable pipelines (if any)
49
+
50
+ ## RAG Pipeline Prompts
51
+
52
+ ### `questionnaire_rag_prompt.txt`
53
+ **Purpose**: System prompt for questionnaire RAG that answers questions about survey questions.
54
+
55
+ **Used in**: `questionnaire_rag.py` → `_get_prompt()`
56
+
57
+ **Dynamic Variables** (handled by LangChain):
58
+ - `{catalog}` - Available polls summary
59
+ - `{context}` - Retrieved question context
60
+ - `{question}` - User's question
61
+
62
+ ### `crosstab_rag_prompt_system.txt`
63
+ **Purpose**: System message for crosstab RAG that analyzes cross-tabulation data.
64
+
65
+ **Used in**: `crosstab_rag.py` → `CrosstabSummarizer.summarize()`
66
+
67
+ ### `crosstab_rag_prompt_user.txt`
68
+ **Purpose**: User message template for crosstab analysis.
69
+
70
+ **Used in**: `crosstab_rag.py` → `CrosstabSummarizer.summarize()`
71
+
72
+ **Dynamic Variables**:
73
+ - `{user_query}` - The user's question
74
+ - `{question_context}` - Retrieved survey question text (if available)
75
+ - `{relevance_check}` - Instructions for relevance checking (if question available)
76
+ - `{context_text}` - Formatted crosstab data chunks
77
+
78
+ ### `toplines_rag_prompt.txt`
79
+ **Purpose**: Prompt for toplines RAG that analyzes response frequencies.
80
+
81
+ **Used in**: `toplines_rag.py` → `_synthesize_answer()`
82
+
83
+ **Dynamic Variables**:
84
+ - `{query}` - The user's question
85
+ - `{context_snippets}` - Formatted topline document snippets
86
+
87
+ ## Updating Prompts
88
+
89
+ To update a prompt:
90
+
91
+ 1. Edit the corresponding `.txt` file in this directory
92
+ 2. Restart the agent/RAG system to load the new prompt
93
+ 3. No code changes needed - prompts are loaded dynamically at runtime
94
+
95
+ ## Notes
96
+
97
+ - All prompts use Python string formatting (`{variable}` syntax)
98
+ - Dynamic variables are filled in at runtime by the calling code
99
+ - Prompt files are read from disk each time the loading code runs (there is no in-memory cache); restart the agent/RAG system to be sure an edited prompt is picked up
100
+ - If a prompt file is missing, the system will raise a `FileNotFoundError` with a clear message
101
+
prompts/crosstab_rag_prompt_system.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ You are a data analyst assistant specialized in interpreting survey crosstab tables.
2
+
3
+ 🚨 CRITICAL: Before answering, check if the retrieved question actually matches the user's query.
4
+ - If the question is about a DIFFERENT topic than what the user asked, you MUST explicitly state this.
5
+ - Do NOT provide detailed analysis of irrelevant data - instead clearly explain that the retrieved question doesn't match.
6
+ - Only provide detailed analysis if the question is relevant to the user's query.
7
+
8
+ Follow the structure and provide clear, specific answers based only on the context provided.
9
+
prompts/crosstab_rag_prompt_user.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ User question: {user_query}{question_context}{relevance_check}
2
+
3
+ Context (crosstab parts):
4
+ {context_text}
5
+
6
+ Answer the question based only on the context above. If the retrieved question doesn't match the user's query, explicitly state this.
7
+
prompts/questionnaire_rag_prompt.txt ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are an expert assistant for analyzing poll questionnaires.
2
+
3
+ 🚨 CRITICAL RULES - NEVER VIOLATE THESE:
4
+
5
+ 1. **ONLY use information from the provided context**
6
+ - Do NOT make up questions, polls, or dates
7
+ - Do NOT assume a poll exists if it's not in the context
8
+ - If information is missing, say "I don't have data for [X]" rather than making it up
9
+
10
+ 2. **Verify data exists before listing it**
11
+ - Before mentioning any poll, check it's actually in the context
12
+ - Before listing questions, confirm they exist in the retrieved data
13
+ - If asked about multiple time periods, explicitly state which ones have data and which don't
14
+
15
+ 3. **Be explicit about what's NOT in the data**
16
+ - If asked about "2024 and 2025" but only 2025 data exists, say: "I have data for 2025, but there is no 2024 data in the retrieved results"
17
+ - Never silently skip missing data - always acknowledge it
18
+
19
+ 4. **When listing questions:**
20
+ - List ALL questions from the context in order
21
+ - Include full question text and response options
22
+ - Note sampling inline in clear language:
23
+ * "Asked to all respondents" (not "ASK ALL")
24
+ * "Asked to half the sample" (not "HALFSAMP1=1")
25
+ * "Asked only if [condition]" (not technical codes)
26
+ - If sibling variants exist, note "One of two versions shown to different groups"
27
+ - Always cite which poll(s) you're using
28
+
29
+ 5. **Format for scannability:**
30
+ - Use numbered lists for questions
31
+ - Bold question text
32
+ - Include response options as bullet points
33
+ - Put sampling info in parentheses after question
34
+
35
+ Available polls in the system (for reference):
36
+ {catalog}
37
+
38
+ Context (ONLY source of truth):
39
+ {context}
40
+
41
+ Question: {question}
42
+
prompts/research_brief_prompt.txt ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are a research planning expert for survey data analysis.
2
+
3
+ # TODO: REMOVE WHEN PIPELINES READY - Use dynamic status
4
+ Available data sources:
5
+ {available_pipelines}
6
+
7
+ # TODO: REMOVE WHEN PIPELINES READY - START
8
+ ⚠️ IMPORTANT: Currently questionnaire, toplines, and crosstabs pipelines are available.
9
+ - SQL pipeline is NOT yet available
10
+ - If the user asks for raw data analysis requiring SQL, use action="followup" to inform them
11
+ - You CAN use toplines and crosstabs for response frequencies and cross-tabulations
12
+ # TODO: REMOVE WHEN PIPELINES READY - END
13
+
14
+ {available_surveys}
15
+
16
+ {available_months}
17
+
18
+ You have FOUR possible actions:
19
+
20
+ **1. followup** - Ask clarifying question if ambiguous OR if user asks for unavailable data
21
+
22
+ **2. answer** - Answer directly without data (system questions, general knowledge)
23
+
24
+ **3. route_to_sources** - Simple query that can be answered with one-shot data retrieval
25
+ Use this for MOST queries including:
26
+ - "What questions were asked in June 2025?" ← Use this with QUESTIONNAIRE pipeline
27
+ - "Show me all healthcare questions" ← Use this with QUESTIONNAIRE pipeline
28
+ - "What was Trump's approval in June 2025?" ← Use this with TOPLINES pipeline (approval = response data)
29
+ - "What about June 2025?" (when June 2022 was discussed before) ← Use this for the NEW date only
30
+ - ANY query asking about a SINGLE time period or survey ← Use this
31
+
32
+ CRITICAL PIPELINE SELECTION:
33
+ - Use QUESTIONNAIRE when user asks: "what questions", "list questions", "show questions"
34
+ - Use TOPLINES when user asks: "approval", "ratings", "percentages", "how many", "what %", "response frequencies"
35
+ - Use CROSSTABS when user asks: "vary by", "breakdown by", "by gender/age/race/etc", "differences by"
36
+
37
+ CRITICAL: When user mentions a new time period, retrieve ONLY that period.
38
+ Do NOT create comparative queries unless explicitly requested!
39
+
40
+ **4. execute_stages** - Complex query requiring SEQUENTIAL staged research
41
+ Use this ONLY for:
42
+ - Explicit comparisons: "compare 2024 vs 2025", "what changed between surveys"
43
+ - Queries with "most/least/best/worst" needing analysis across multiple retrievals
44
+ - Queries explicitly asking for differences or changes
45
+
46
+ DO NOT use stages for simple follow-up questions about different time periods!
47
+
48
+ # TODO: REMOVE WHEN PIPELINES READY - START
49
+ NOTE: SQL pipeline isn't available yet, but toplines and crosstabs are available for analysis
50
+ # TODO: REMOVE WHEN PIPELINES READY - END
51
+
52
+ CRITICAL RULES FOR CONVERSATION CONTEXT:
53
+ - When user says "what about [X]?" they're asking a NEW question about X
54
+ - Do NOT assume they want to compare with previous topics
55
+ - "What about June 2025?" means "show me June 2025" (NOT "compare with previous time period")
56
+ - Only create multi-stage queries when user EXPLICITLY asks to compare
57
+
58
+ 🚨 HANDLING REFERENCE PHRASES (e.g., "these questions", "for each", "all of them"):
59
+ - When user references previous results (e.g., "for each of these questions", "how do responses vary for these"),
60
+ you MUST infer the context from conversation history:
61
+ - Extract time periods (year, month) from previous messages
62
+ - If previous answer showed questions from February and June 2025, use those months
63
+ - If previous answer listed multiple questions, create stages for each question OR each month (depending on query)
64
+ - Example: "how do responses vary by gender for each of these questions?"
65
+ → If previous answer showed economy questions from Feb and June 2025, create stages for each month
66
+ → Use action="execute_stages" with stage per month, query for economy questions by gender
67
+ - DO NOT ask followup for month/year if you can infer it from conversation history
68
+
69
+ FILTERING RULES:
70
+ - Extract survey name from user query and map to exact stored name
71
+ - "Unity Poll" → "Vanderbilt_Unity_Poll"
72
+ - Be precise with year and month extraction
73
+
74
+ CRITICAL: HANDLING YEAR-ONLY QUERIES (no month specified):
75
+ - If user provides ONLY a year (e.g., "in 2025", "for 2025") and asks for crosstabs or toplines:
76
+ → These pipelines REQUIRE a month.
77
+
78
+ **ALWAYS ASK FOLLOWUP** - Do NOT assume they want the entire year:
79
+ - action="followup"
80
+ - followup_question="Which month(s) in 2025 would you like to see? Available months: [list months from the available polls description above]"
81
+
82
+ **ONLY create multiple stages if user EXPLICITLY asks for year-wide/all months:**
83
+ - Examples: "across all of 2025", "for the entire year 2025", "all months in 2025"
84
+ - Then: action="execute_stages" with one stage per available month
85
+
86
+ - If user provides year+month → use single-stage (route_to_sources)
87
+ - If user provides ONLY year + questionnaire pipeline → can query by year only (questionnaire supports year-only filters)
88
+ - If user provides ONLY year + crosstabs/toplines → MUST ask followup (unless user explicitly requests all months/year-wide)
89
+
90
+ {verification_context}
91
+
92
+ Examples:
93
+
94
+ User: "what questions were asked in June 2025?"
95
+ Brief: action=route_to_sources, retrieve June 2025 questions
96
+
97
+ User: "what about June 2025?" (after discussing June 2022)
98
+ Brief: action=route_to_sources, retrieve June 2025 questions ← NOT staged!
99
+
100
+ User: "compare June 2024 vs June 2025"
101
+ Brief: action=execute_stages, stage 1: 2024, stage 2: 2025 ← This needs stages
102
+
103
+ User: "june 2022 unity poll"
104
+ Brief: action=route_to_sources, year=2022, month=June, survey_name='Vanderbilt_Unity_Poll'
105
+
106
+ User: "How do responses vary by gender in 2025?" (NO MONTH, asking for crosstabs)
107
+ Brief: action=followup, followup_question="Which month(s) in 2025 would you like to see? Available months: February, June"
108
+ DO NOT create multiple stages unless user explicitly asks for "all months" or "entire year"
109
+
110
+ User: "How do responses vary by gender across all of 2025?" (EXPLICIT year-wide request)
111
+ Brief: action=execute_stages, stage 1: year=2025, month=February, stage 2: year=2025, month=June
112
+
113
+ User: "What questions were asked in 2025?" (NO MONTH, asking for questionnaire)
114
+ Brief: action=route_to_sources, year=2025 (questionnaire supports year-only filters)
115
+
116
+ User: "what questions were asked about the economy in 2025?" (first message)
117
+ Brief: action=route_to_sources, year=2025, topic='economy'
118
+
119
+ User: "how do the responses to questions about the economy vary by gender for each of these questions?" (follow-up, referencing previous)
120
+ Brief: action=execute_stages
121
+ - Previous question: "what questions about the economy in 2025?"
122
+ - Infer: User asked about 2025, so use ALL available months for 2025 (February and June)
123
+ - Stage 1: year=2025, month=February, crosstabs for economy questions by gender
124
+ - Stage 2: year=2025, month=June, crosstabs for economy questions by gender
125
+ - DO NOT ask followup - infer months from previous question's year (2025)
126
+
127
+ User: "what was trump's approval in 2025?" (asks followup for month)
128
+ Brief: action=followup, followup_question="Which month(s) in 2025 would you like to see Trump's approval ratings for? Available months: February, June"
129
+
130
+ User: "June" (short answer to followup)
131
+ Brief: action=route_to_sources
132
+ - Previous question: "what was trump's approval in 2025?"
133
+ - Combine: "Trump's approval in June 2025"
134
+ - Use TOPLINES pipeline (original question asked about approval/ratings)
135
+ - Filters: year=2025, month=June, topic/query about Trump approval
136
+ - DO NOT use questionnaire pipeline - user wants approval DATA, not questions
137
+
prompts/synthesis_prompt_system.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ You are a survey data analyst synthesizing research results. Your primary responsibilities are:
2
+
3
+ 1. **Extract and present ACTUAL DATA VALUES**: When the user asks about responses, percentages, or breakdowns, you MUST extract and present the actual numbers, percentages, and counts from the raw data. DO NOT provide generic descriptions like "responses are broken down by gender" - instead say "Male: 45% approve, 30% disapprove. Female: 35% approve, 40% disapprove" with the actual numbers from the data.
4
+
5
+ 2. **Relevance check**: Only synthesize data that is actually relevant to the user's question. If retrieved data doesn't match the question, explicitly state this and avoid providing irrelevant analysis.
6
+
7
+ 3. **Data accuracy**: Use only the data provided in the context. Extract specific numbers, percentages, and values. Present them clearly and accurately.
8
+
9
+ 4. **For crosstabs questions**: When asked about variations by demographics, extract the actual percentages/numbers for each demographic group from the crosstab chunks and present them in a clear, organized format.
10
+
prompts/synthesis_prompt_user.txt ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Synthesize raw data from {stage_count} to answer the user's question.
2
+
3
+ User question: {full_question}
4
+
5
+ Research plan: {reasoning}
6
+
7
+ Retrieved raw data:
8
+ {context_parts}
9
+
10
+ {unavailable_note}
11
+
12
+ 🚨 CRITICAL INSTRUCTIONS:
13
+
14
+ 1. **RELEVANCE CHECK FIRST**: For each stage's data, check if it actually answers the user's question.
15
+ - If a stage retrieved data about a DIFFERENT topic than what the user asked, explicitly state this.
16
+ - Example: If user asked about "economy" but a stage returned data about "unity" or "political topics", clearly state that this stage did not find relevant data.
17
+ - Do NOT provide detailed analysis of irrelevant data - instead explain what was found and why it doesn't match.
18
+
19
+ 2. **HANDLE DIFFERENT DATA TYPES - EXTRACT ACTUAL NUMBERS**:
20
+ - **QUESTIONNAIRE DATA**: Format questions clearly with question text, response options, and topics. Use this when user asks "what questions were asked?"
21
+ - **TOPLINES DATA**: Present response frequencies and percentages clearly with EXACT NUMBERS. Use this when user asks about approval ratings, percentages, or response frequencies.
22
+ - **CROSSTABS DATA**: CRITICAL - Extract and present the ACTUAL NUMBERS, PERCENTAGES, and BREAKDOWNS from the crosstab data.
23
+ * When user asks "how do responses vary by gender/age/etc.", you MUST extract and present the actual percentages for each demographic group
24
+ * Example: "Male: 45% approve, 30% disapprove. Female: 35% approve, 40% disapprove."
25
+ * DO NOT just say "responses are broken down by gender" - you MUST include the actual numbers
26
+ * Extract percentages, counts, and breakdowns from the crosstab chunks provided
27
+ * Present the data in a clear, organized format showing the actual variation by the requested demographic
28
+ - Combine data types appropriately when multiple types are available.
29
+
30
+ 3. **ONLY SYNTHESIZE RELEVANT DATA**:
31
+ - Focus your answer on stages that actually addressed the user's question.
32
+ - For irrelevant stages, briefly acknowledge them and explain why they don't help answer the question.
33
+
34
+ 4. **Answer Structure - INCLUDE ACTUAL DATA**:
35
+ - If this is a comparative query, clearly organize by the comparison dimensions WITH ACTUAL NUMBERS
36
+ - If this is an analytical query (most/least/best/worst), perform the analysis USING THE ACTUAL DATA VALUES
37
+ - Preserve important details from RELEVANT research
38
+ - Use natural language, be clear and organized
39
+ - Cite which poll(s), survey dates, or stage(s) information comes from
40
+ - Format numbers and percentages clearly - ALWAYS include the actual values from the data
41
+ - For crosstabs: Extract and present the actual breakdown percentages/numbers for each demographic group
42
+ - DO NOT provide generic descriptions - provide specific numbers, percentages, and data points from the retrieved data
43
+
44
+ 5. **When No Relevant Data Found**:
45
+ - If multiple stages returned irrelevant data, clearly state: "The retrieved data does not match your question about [topic]. The available data is about [different topics found]. I cannot provide an answer to your specific question with the current data."
46
+ - Do NOT synthesize irrelevant information just because it was retrieved.
47
+
48
+ 6. **Data Integrity**:
49
+ - Do NOT make up information not in the retrieved raw data
50
+ - Use only the data provided in the context above
51
+ - If percentages or numbers are provided, use them accurately
52
+ - TODO: REMOVE WHEN PIPELINES READY - If some data sources weren't available, clearly state this and explain what you CAN provide
53
+
prompts/toplines_rag_prompt.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ You are a polling analyst. Use the following Vanderbilt Unity Poll topline data to answer the question accurately.
2
+
3
+ Question: {query}
4
+
5
+ Context:
6
+ {context_snippets}
7
+
8
+ Write a concise, factual summary of the topline results, referencing the poll name and date. If the question is not answerable from this data, politely state that the system only has Vanderbilt Unity Poll data.
9
+
prompts/verification_prompt_system.txt ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are a verification expert. Your ONLY job is to check if the retrieved RAW DATA/Questions match what the user asked for.
2
+
3
+ CRITICAL RULES:
4
+ 1. **Match the question literally** - Don't add requirements the user didn't ask for
5
+ - If they asked "what questions were asked?" and we retrieved questions → SUCCESS
6
+ - If they asked "what are the results/percentages?" and we only have questions → FAILURE
7
+ - If they asked "how do responses vary by X?" and we have crosstabs → SUCCESS
8
+
9
+ 2. **Examine the raw data structure**:
10
+ - For questionnaire: Check if source_questions exist and match the query topic/intent
11
+ - For toplines: Check if retrieved_docs exist and contain relevant response data
12
+ - For crosstabs: Check if crosstab_docs_by_variable exist for the requested breakdown
13
+
14
+ 3. **Only fail if there's an actual problem**:
15
+ - We retrieved the wrong type of data (e.g., questions when they asked for results)
16
+ - We retrieved from the wrong time period/survey (AND this is confirmed by mismatched filters AND raw data time period)
17
+ - The retrieved data doesn't match the query topic (e.g., asked about "economy" but got "politics")
18
+ - We have no data when the user asked for specific data
19
+
20
+ 4. **Do NOT fail if**:
21
+ - User asked for questions and we got questions (even if we don't have "analysis")
22
+ - User asked for data from June 2025 and filters show year=2025, month=June AND raw data shows the same period → APPROVE
23
+ - The time period in the raw data matches the filters applied → TRUST the filtering system worked correctly
24
+ - The data seems sufficient to answer their actual question
25
+ - We have at least some relevant data (even if not perfect)
26
+
27
+ 5. **CRITICAL: Time Period Verification**:
28
+ - If filters were applied (year=2025, month=June) AND the raw data details show "Time period: June 2025" → APPROVE
29
+ - The retrieval system filters data BEFORE returning it, so if filters match the time period shown, trust it
30
+ - Only fail if there's a CLEAR mismatch (e.g., filters say June 2025 but raw data shows March 2024)
31
+
32
+ Be practical, not pedantic. If the retrieved raw data can answer what they asked, approve it.
33
+
prompts/verification_prompt_user.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ User question: "{question}"
2
+
3
+ Filters applied during retrieval:
4
+ {filters_applied}
5
+
6
+ Summary of retrieved data:
7
+ {retrieval_summary}
8
+
9
+ Detailed raw data structure:
10
+ {raw_data_details}
11
+
12
+ Question: Can we answer the user's question with this raw data?
13
+ - Consider the TYPE of data (questions vs results vs crosstabs)
14
+ - Consider the TOPIC relevance (does the data match what they asked about?)
15
+ - Consider the TIME PERIOD (is it from the right survey/month?) - Check both the filters applied AND the time period shown in the raw data details
16
+ - Consider the QUANTITY (do we have at least some data?)
17
+
18
+ IMPORTANT: If filters were applied (e.g., year=2025, month=June) AND the raw data details show the same time period, trust that the time period is correct. The retrieval system filters data before returning it.
19
+
20
+ Answer YES only if the raw data is sufficient to answer their question. Answer NO if data is missing, wrong type, wrong topic, or wrong time period.
21
+
questionnaire_rag.py CHANGED
@@ -27,6 +27,15 @@ except ImportError:
27
  pass
28
 
29
 
 
 
 
 
 
 
 
 
 
30
  class QuestionnaireRAG:
31
  """
32
  Improved questionnaire RAG with:
@@ -267,6 +276,28 @@ class QuestionnaireRAG:
267
  print(f"⚠️ Filtered out: wrong survey {metadata.get('survey_name')} != {matched_name}")
268
  valid = False
269
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
  if valid:
271
  validated_docs.append(doc)
272
 
@@ -274,49 +305,9 @@ class QuestionnaireRAG:
274
 
275
  def _get_prompt(self) -> ChatPromptTemplate:
276
  """Get the improved system prompt with anti-hallucination measures"""
 
277
  return ChatPromptTemplate.from_messages([
278
- ("system", """You are an expert assistant for analyzing poll questionnaires.
279
-
280
- 🚨 CRITICAL RULES - NEVER VIOLATE THESE:
281
-
282
- 1. **ONLY use information from the provided context**
283
- - Do NOT make up questions, polls, or dates
284
- - Do NOT assume a poll exists if it's not in the context
285
- - If information is missing, say "I don't have data for [X]" rather than making it up
286
-
287
- 2. **Verify data exists before listing it**
288
- - Before mentioning any poll, check it's actually in the context
289
- - Before listing questions, confirm they exist in the retrieved data
290
- - If asked about multiple time periods, explicitly state which ones have data and which don't
291
-
292
- 3. **Be explicit about what's NOT in the data**
293
- - If asked about "2024 and 2025" but only 2025 data exists, say: "I have data for 2025, but there is no 2024 data in the retrieved results"
294
- - Never silently skip missing data - always acknowledge it
295
-
296
- 4. **When listing questions:**
297
- - List ALL questions from the context in order
298
- - Include full question text and response options
299
- - Note sampling inline in clear language:
300
- * "Asked to all respondents" (not "ASK ALL")
301
- * "Asked to half the sample" (not "HALFSAMP1=1")
302
- * "Asked only if [condition]" (not technical codes)
303
- - If sibling variants exist, note "One of two versions shown to different groups"
304
- - Always cite which poll(s) you're using
305
-
306
- 5. **Format for scannability:**
307
- - Use numbered lists for questions
308
- - Bold question text
309
- - Include response options as bullet points
310
- - Put sampling info in parentheses after question
311
-
312
- Available polls in the system (for reference):
313
- {catalog}
314
-
315
- Context (ONLY source of truth):
316
- {context}
317
-
318
- Question: {question}
319
- """),
320
  ("human", "Answer:")
321
  ])
322
 
@@ -349,6 +340,78 @@ Question: {question}
349
  """
350
  return self._query_internal(question, filters, k)
351
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
352
  def _query_internal(
353
  self,
354
  question: str,
 
27
  pass
28
 
29
 
30
+ def _load_prompt_file(filename: str) -> str:
31
+ """Load a prompt file from the prompts directory"""
32
+ prompt_dir = Path(__file__).parent / "prompts"
33
+ prompt_path = prompt_dir / filename
34
+ if not prompt_path.exists():
35
+ raise FileNotFoundError(f"Prompt file not found: {prompt_path}")
36
+ return prompt_path.read_text(encoding="utf-8")
37
+
38
+
39
  class QuestionnaireRAG:
40
  """
41
  Improved questionnaire RAG with:
 
276
  print(f"⚠️ Filtered out: wrong survey {metadata.get('survey_name')} != {matched_name}")
277
  valid = False
278
 
279
+ # Check topic (if topic filter is provided)
280
+ if "topic" in filters and valid:
281
+ expected_topic = filters["topic"].lower()
282
+ # Topics are stored as comma-separated string in metadata
283
+ doc_topics = metadata.get("topics", "")
284
+ if isinstance(doc_topics, str):
285
+ doc_topics_list = [t.strip().lower() for t in doc_topics.split(",")]
286
+ elif isinstance(doc_topics, list):
287
+ doc_topics_list = [str(t).strip().lower() for t in doc_topics]
288
+ else:
289
+ doc_topics_list = []
290
+
291
+ if self.verbose and valid:
292
+ var_name = metadata.get("variable_name", "unknown")
293
+ print(f" 🔍 Checking topic '{expected_topic}' for {var_name}: doc_topics={doc_topics_list}")
294
+
295
+ if expected_topic not in doc_topics_list:
296
+ if self.verbose:
297
+ var_name = metadata.get("variable_name", "unknown")
298
+ print(f"⚠️ Filtered out {var_name}: topic '{expected_topic}' not in {doc_topics_list}")
299
+ valid = False
300
+
301
  if valid:
302
  validated_docs.append(doc)
303
 
 
305
 
306
  def _get_prompt(self) -> ChatPromptTemplate:
307
  """Get the improved system prompt with anti-hallucination measures"""
308
+ system_prompt_template = _load_prompt_file("questionnaire_rag_prompt.txt")
309
  return ChatPromptTemplate.from_messages([
310
+ ("system", system_prompt_template),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
  ("human", "Answer:")
312
  ])
313
 
 
340
  """
341
  return self._query_internal(question, filters, k)
342
 
343
def retrieve_raw_data(
    self,
    question: str,
    filters: Optional[Dict[str, Any]] = None,
    k: int = 20
) -> Dict[str, Any]:
    """
    Retrieve raw data without LLM formatting.
    Used by agent framework to get raw data for synthesis.

    Args:
        question: Query text passed to the retriever.
        filters: Optional metadata filters; used both to build the Pinecone
            filter and to validate the retrieved documents afterwards.
        k: Maximum number of documents requested from the vector store.

    Returns:
        Dict with 'source_questions', 'num_sources', 'filters_applied', 'retrieved_docs'
    """
    applied_filters = filters or {}

    if self.verbose:
        print(f"\n📊 [Raw Data] Query: {question}")
        if filters:
            print(f"🔍 Filters: {filters}")

    # Build the search arguments once; the metadata filter is only attached
    # when the filter builder actually produced one.
    search_kwargs: Dict[str, Any] = {"k": k}
    pinecone_filter = self._build_pinecone_filter(applied_filters)
    if pinecone_filter:
        if self.verbose:
            print(f"🔧 Pinecone filter: {pinecone_filter}")
        search_kwargs["filter"] = pinecone_filter

    retriever = self.vectorstore.as_retriever(search_kwargs=search_kwargs)
    docs = retriever.invoke(question)

    if self.verbose:
        print(f"📥 Retrieved {len(docs)} documents from Pinecone")

    # Validate results match filters
    if filters:
        docs = self._validate_results(docs, filters)
        if self.verbose:
            print(f"✅ After validation: {len(docs)} documents")

    # Nothing retrieved (or everything was filtered out)
    if not docs:
        return {
            "source_questions": [],
            "num_sources": 0,
            "filters_applied": applied_filters,
            "retrieved_docs": [],
        }

    # Reconstruct full questions, keeping one entry per question_id.
    # Documents whose id is not in questions_by_id are skipped.
    full_questions = []
    seen_ids = set()
    for doc in docs:
        q_id = doc.metadata.get("question_id")
        if not q_id or q_id in seen_ids:
            continue
        if q_id in self.questions_by_id:
            full_questions.append(self.questions_by_id[q_id])
            seen_ids.add(q_id)

    # Sort by poll date, then position, to maintain survey order.
    # `or` (not a .get default) so an explicit None value cannot break the
    # tuple comparison with a TypeError.
    full_questions.sort(
        key=lambda q: (q.get("poll_date") or "", q.get("position") or 0)
    )

    return {
        "source_questions": full_questions,
        "num_sources": len(full_questions),
        "filters_applied": applied_filters,
        "retrieved_docs": docs,
    }
414
+
415
  def _query_internal(
416
  self,
417
  question: str,
survey_agent.py CHANGED
@@ -13,8 +13,8 @@ When new pipelines (toplines, crosstabs, SQL) become available:
13
 
14
  Current Status:
15
  - ✅ Questionnaire pipeline: ACTIVE
16
- - Toplines pipeline: Not yet implemented
17
- - Crosstabs pipeline: Not yet implemented
18
  - ⏳ SQL pipeline: Not yet implemented
19
  """
20
 
@@ -32,6 +32,21 @@ from pydantic import BaseModel, Field, ConfigDict
32
 
33
  # Import the questionnaire RAG
34
  from questionnaire_rag import QuestionnaireRAG
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  try:
37
  from dotenv import load_dotenv
@@ -47,7 +62,7 @@ except ImportError:
47
  class QueryFilters(BaseModel):
48
  """Filters for data source queries - Pydantic v2 with strict schema"""
49
  model_config = ConfigDict(extra="forbid")
50
-
51
  year: Optional[int] = Field(default=None, description="Year filter (e.g., 2025)")
52
  month: Optional[str] = Field(default=None, description="Month filter (e.g., 'February')")
53
  poll_date: Optional[str] = Field(default=None, description="Specific poll date (e.g., '2025-02-15')")
@@ -59,7 +74,7 @@ class QueryFilters(BaseModel):
59
  class DataSource(BaseModel):
60
  """Represents a data source to query"""
61
  model_config = ConfigDict(extra="forbid")
62
-
63
  source_type: Literal["questionnaire", "toplines", "crosstabs", "sql"]
64
  query_description: str = Field(description="What to retrieve from this source")
65
  filters: QueryFilters = Field(default_factory=QueryFilters, description="Filters to apply")
@@ -69,13 +84,13 @@ class DataSource(BaseModel):
69
  class ResearchStage(BaseModel):
70
  """A single stage in a multi-stage research plan"""
71
  model_config = ConfigDict(extra="forbid")
72
-
73
  stage_number: int = Field(description="Stage number (1-indexed)")
74
  description: str = Field(description="What this stage accomplishes")
75
  data_sources: List[DataSource] = Field(description="Data sources to query in this stage")
76
  depends_on_stages: List[int] = Field(default_factory=list, description="Which prior stages this depends on")
77
  use_previous_results_for: Optional[str] = Field(
78
- default=None,
79
  description="How to use previous stage results (e.g., 'Extract question IDs from stage 1')"
80
  )
81
 
@@ -83,14 +98,14 @@ class ResearchStage(BaseModel):
83
  class ResearchBrief(BaseModel):
84
  """Research brief - can be either single-stage or multi-stage"""
85
  model_config = ConfigDict(extra="forbid")
86
-
87
  action: Literal["answer", "followup", "route_to_sources", "execute_stages"]
88
  followup_question: Optional[str] = Field(default=None, description="Follow-up question to ask user")
89
  reasoning: str = Field(description="Why this approach was chosen")
90
-
91
  # For simple queries (single-stage)
92
  data_sources: List[DataSource] = Field(default_factory=list, description="Data sources for simple queries")
93
-
94
  # For complex queries (multi-stage)
95
  stages: List[ResearchStage] = Field(default_factory=list, description="Ordered stages of research")
96
 
@@ -98,7 +113,7 @@ class ResearchBrief(BaseModel):
98
  class StageResult(BaseModel):
99
  """Results from executing one stage"""
100
  model_config = ConfigDict(extra="forbid")
101
-
102
  stage_number: int
103
  status: Literal["success", "partial", "failed"]
104
  questionnaire_results: Optional[Dict[str, Any]] = None
@@ -114,7 +129,7 @@ class StageResult(BaseModel):
114
  class VerificationResult(BaseModel):
115
  """Result of verifying if data answers the question"""
116
  model_config = ConfigDict(extra="forbid")
117
-
118
  answers_question: bool = Field(description="Whether the data fully answers the question")
119
  missing_info: Optional[str] = Field(default=None, description="What information is missing")
120
  improvement_suggestion: Optional[str] = Field(default=None, description="How to improve the research brief")
@@ -125,24 +140,24 @@ class SurveyAnalysisState(TypedDict):
125
  # User interaction
126
  messages: Annotated[List, operator.add]
127
  user_question: str
128
-
129
  # Planning
130
  research_brief: Optional[ResearchBrief]
131
-
132
  # Stage execution
133
  current_stage: int # Which stage we're executing (0-indexed internally, but 1-indexed in models)
134
  stage_results: List[StageResult] # Results from each completed stage
135
-
136
  # Legacy single-stage results (for backward compatibility)
137
  questionnaire_results: Optional[Dict[str, Any]]
138
  toplines_results: Optional[Dict[str, Any]]
139
  crosstabs_results: Optional[Dict[str, Any]]
140
  sql_results: Optional[Dict[str, Any]]
141
-
142
  # Verification & synthesis
143
  verification: Optional[VerificationResult]
144
  final_answer: Optional[str]
145
-
146
  # Control flow
147
  retry_count: int
148
  max_retries: int
@@ -155,7 +170,7 @@ class SurveyAnalysisState(TypedDict):
155
  class SurveyAnalysisAgent:
156
  """
157
  Multi-agent system for analyzing survey data with staged research briefs.
158
-
159
  Flow:
160
  1. User asks question
161
  2. Research brief agent decides: simple (one-shot) or complex (staged)
@@ -163,12 +178,12 @@ class SurveyAnalysisAgent:
163
  4. For complex: execute stages sequentially, each using previous results
164
  5. Final synthesis combines all stage results
165
  """
166
-
167
  # TODO: REMOVE WHEN PIPELINES READY - START
168
  # Track which pipelines are currently available
169
- AVAILABLE_PIPELINES = {"questionnaire"} # Add "toplines", "crosstabs", "sql" as they become ready
170
  # TODO: REMOVE WHEN PIPELINES READY - END
171
-
172
  def __init__(
173
  self,
174
  openai_api_key: str,
@@ -181,13 +196,13 @@ class SurveyAnalysisAgent:
181
  self.pinecone_api_key = pinecone_api_key
182
  self.verbose = verbose
183
  self.max_retries = max_retries
184
-
185
  # Initialize LLM
186
  self.llm = ChatOpenAI(
187
  model=os.getenv("OPENAI_MODEL", "gpt-4o"),
188
  temperature=0
189
  )
190
-
191
  # Initialize questionnaire RAG
192
  if self.verbose:
193
  print("Initializing questionnaire RAG system...")
@@ -197,28 +212,38 @@ class SurveyAnalysisAgent:
197
  persist_directory=questionnaire_persist_dir,
198
  verbose=verbose
199
  )
200
-
 
 
 
 
 
 
 
 
 
 
201
  # Build the graph
202
  self.graph = self._build_graph()
203
-
204
  if self.verbose:
205
  print("✓ Survey analysis agent initialized with staged research capability")
206
-
207
  def _build_graph(self) -> StateGraph:
208
  """Build the LangGraph workflow with staged research support"""
209
-
210
  workflow = StateGraph(SurveyAnalysisState)
211
-
212
  # Add nodes
213
  workflow.add_node("generate_research_brief", self._generate_research_brief)
214
  workflow.add_node("execute_stage", self._execute_stage)
215
  workflow.add_node("extract_stage_context", self._extract_stage_context)
216
  workflow.add_node("verify_results", self._verify_results)
217
  workflow.add_node("synthesize_response", self._synthesize_response)
218
-
219
  # Define edges
220
  workflow.add_edge(START, "generate_research_brief")
221
-
222
  # After research brief, route based on action
223
  workflow.add_conditional_edges(
224
  "generate_research_brief",
@@ -229,10 +254,10 @@ class SurveyAnalysisAgent:
229
  "execute_stage": "execute_stage"
230
  }
231
  )
232
-
233
  # After stage execution, extract context for next stage
234
  workflow.add_edge("execute_stage", "extract_stage_context")
235
-
236
  # After context extraction, decide next step
237
  workflow.add_conditional_edges(
238
  "extract_stage_context",
@@ -242,7 +267,7 @@ class SurveyAnalysisAgent:
242
  "verify": "verify_results" # All stages done, verify
243
  }
244
  )
245
-
246
  # After verification, decide next step
247
  workflow.add_conditional_edges(
248
  "verify_results",
@@ -253,27 +278,66 @@ class SurveyAnalysisAgent:
253
  "give_up": "synthesize_response"
254
  }
255
  )
256
-
257
  # End after synthesis
258
  workflow.add_edge("synthesize_response", END)
259
-
260
  # Compile with memory
261
  memory = MemorySaver()
262
  return workflow.compile(checkpointer=memory)
263
-
264
  def _get_available_surveys_description(self) -> str:
265
  """Get formatted description of available surveys for LLM prompt"""
266
  survey_names = self.questionnaire_rag.get_available_survey_names()
267
-
268
  if not survey_names:
269
  return "No surveys currently loaded."
270
-
271
  lines = ["Available survey names in the system:"]
272
  for name in survey_names:
273
  lines.append(f" - '{name}'")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
 
275
  return "\n".join(lines)
276
-
277
  # TODO: REMOVE WHEN PIPELINES READY - START
278
  def _get_pipeline_status_description(self) -> str:
279
  """Get description of available vs unavailable pipelines"""
@@ -283,61 +347,101 @@ class SurveyAnalysisAgent:
283
  "crosstabs": "Pre-computed cross-tabulations by demographics",
284
  "sql": "Raw survey responses for custom analysis"
285
  }
286
-
287
  lines = []
288
  for pipeline, description in all_pipelines.items():
289
  status = "✅ AVAILABLE" if pipeline in self.AVAILABLE_PIPELINES else "❌ NOT YET AVAILABLE"
290
  lines.append(f"{pipeline.capitalize()}: {description} {status}")
291
-
292
  return "\n".join(lines)
293
  # TODO: REMOVE WHEN PIPELINES READY - END
294
-
295
  def _get_full_question_context(self, state: SurveyAnalysisState) -> str:
296
  """
297
  Build full question context from conversation history.
298
-
299
  IMPORTANT: Only look at the LATEST user message for the current query.
300
  Previous messages provide context but the latest message is what we're answering.
301
  """
302
  messages = state.get("messages", [])
303
-
304
  # Extract all human messages
305
  human_messages = []
306
  for msg in messages:
307
  if isinstance(msg, HumanMessage):
308
  human_messages.append(msg.content)
309
-
310
  if not human_messages:
311
  return state["user_question"]
312
-
313
  # For planning, just use the latest message
314
  # Don't combine with previous messages as that causes misinterpretation
315
  latest_message = human_messages[-1]
316
-
317
  if self.verbose:
318
  print(f"📝 Conversation history: {len(human_messages)} user message(s)")
319
  for i, msg in enumerate(human_messages, 1):
320
  print(f" {i}. {msg[:100]}..." if len(msg) > 100 else f" {i}. {msg}")
321
  print(f"🎯 Answering latest: {latest_message}")
322
-
323
  return latest_message
324
 
325
-
326
  # ========================================================================
327
  # NODE FUNCTIONS
328
  # ========================================================================
329
-
330
  def _generate_research_brief(self, state: SurveyAnalysisState) -> Dict[str, Any]:
331
  """Generate research brief - decides single-stage vs multi-stage approach"""
332
-
333
  if self.verbose:
334
  print("\n=== GENERATING RESEARCH BRIEF ===")
 
 
 
 
335
 
336
- # Get the current question (latest message only)
337
  question = self._get_full_question_context(state)
338
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
339
  retry_count = state.get("retry_count", 0)
340
-
341
  # Add context from verification if this is a retry
342
  verification_context = ""
343
  if state.get("verification") and retry_count > 0:
@@ -348,93 +452,34 @@ Previous attempt was insufficient:
348
 
349
  Please improve the research plan based on this feedback.
350
  """
351
-
352
- system_prompt = f"""You are a research planning expert for survey data analysis.
353
-
354
- # TODO: REMOVE WHEN PIPELINES READY - Use dynamic status
355
- Available data sources:
356
- {self._get_pipeline_status_description()}
357
-
358
- # TODO: REMOVE WHEN PIPELINES READY - START
359
- ⚠️ IMPORTANT: Currently ONLY the questionnaire pipeline is available.
360
- - Do NOT create research plans that require toplines, crosstabs, or SQL
361
- - If the user asks for results/data/analysis that requires those sources, use action="followup" to inform them
362
- - Focus on what CAN be answered with questionnaires alone (question text, response options, topics, skip logic)
363
- # TODO: REMOVE WHEN PIPELINES READY - END
364
-
365
- {self._get_available_surveys_description()}
366
-
367
- You have FOUR possible actions:
368
-
369
- **1. followup** - Ask clarifying question if ambiguous OR if user asks for unavailable data
370
-
371
- **2. answer** - Answer directly without data (system questions, general knowledge)
372
-
373
- **3. route_to_sources** - Simple query that can be answered with one-shot data retrieval
374
- Use this for MOST queries including:
375
- - "What questions were asked in June 2025?" ← Use this
376
- - "Show me all healthcare questions" ← Use this
377
- - "What about June 2025?" (when June 2022 was discussed before) ← Use this for the NEW date only
378
- - ANY query asking about a SINGLE time period or survey ← Use this
379
-
380
- CRITICAL: When user mentions a new time period, retrieve ONLY that period.
381
- Do NOT create comparative queries unless explicitly requested!
382
-
383
- **4. execute_stages** - Complex query requiring SEQUENTIAL staged research
384
- Use this ONLY for:
385
- - Explicit comparisons: "compare 2024 vs 2025", "what changed between surveys"
386
- - Queries with "most/least/best/worst" needing analysis across multiple retrievals
387
- - Queries explicitly asking for differences or changes
388
-
389
- DO NOT use stages for simple follow-up questions about different time periods!
390
-
391
- # TODO: REMOVE WHEN PIPELINES READY - START
392
- NOTE: Since toplines/crosstabs/SQL aren't available, only use execute_stages for explicit comparisons
393
- # TODO: REMOVE WHEN PIPELINES READY - END
394
-
395
- CRITICAL RULES FOR CONVERSATION CONTEXT:
396
- - When user says "what about [X]?" they're asking a NEW question about X
397
- - Do NOT assume they want to compare with previous topics
398
- - "What about June 2025?" means "show me June 2025" (NOT "compare with June 2022")
399
- - Only create multi-stage queries when user EXPLICITLY asks to compare
400
-
401
- FILTERING RULES:
402
- - Extract survey name from user query and map to exact stored name
403
- - "Unity Poll" → "Vanderbilt_Unity_Poll"
404
- - Be precise with year and month extraction
405
- - If user just provides date/survey, infer they want questions from that period
406
-
407
- {verification_context}
408
-
409
- Examples:
410
-
411
- User: "what questions were asked in June 2025?"
412
- Brief: action=route_to_sources, retrieve June 2025 questions
413
-
414
- User: "what about June 2025?" (after discussing June 2022)
415
- Brief: action=route_to_sources, retrieve June 2025 questions ← NOT staged!
416
-
417
- User: "compare June 2024 vs June 2025"
418
- Brief: action=execute_stages, stage 1: 2024, stage 2: 2025 ← This needs stages
419
-
420
- User: "june 2022 unity poll"
421
- Brief: action=route_to_sources, year=2022, month=June, survey_name='Vanderbilt_Unity_Poll'
422
- """
423
-
424
  brief_generator = self.llm.with_structured_output(ResearchBrief)
425
-
 
 
 
 
426
  brief = brief_generator.invoke([
427
  SystemMessage(content=system_prompt),
428
- HumanMessage(content=f"User question: {question}\n\nGenerate a research brief.")
429
  ])
430
-
431
  if self.verbose:
432
  print(f"Action: {brief.action}")
433
  print(f"Reasoning: {brief.reasoning}")
434
-
435
  if brief.followup_question:
436
  print(f"Follow-up: {brief.followup_question}")
437
-
438
  if brief.action == "route_to_sources" and brief.data_sources:
439
  print(f"Simple query - {len(brief.data_sources)} data sources")
440
  for ds in brief.data_sources:
@@ -442,7 +487,7 @@ Brief: action=route_to_sources, year=2022, month=June, survey_name='Vanderbilt_U
442
  print(f" - {ds.source_type}: {ds.query_description}")
443
  if filters_dict:
444
  print(f" Filters: {filters_dict}")
445
-
446
  if brief.action == "execute_stages" and brief.stages:
447
  print(f"Staged query - {len(brief.stages)} stages")
448
  for stage in brief.stages:
@@ -455,18 +500,18 @@ Brief: action=route_to_sources, year=2022, month=June, survey_name='Vanderbilt_U
455
  print(f" - {ds.source_type}: {ds.query_description}")
456
  if ds.result_label:
457
  print(f" Label: {ds.result_label}")
458
-
459
  return {
460
  "research_brief": brief,
461
  "current_stage": 0, # Start at stage 0 (will execute stage 1 first)
462
  "stage_results": [],
463
  "messages": [AIMessage(content=f"[Research plan: {brief.action}]")]
464
  }
465
-
466
  def _route_after_brief(self, state: SurveyAnalysisState) -> str:
467
  """Route based on research brief action"""
468
  brief = state["research_brief"]
469
-
470
  if brief.action == "followup":
471
  return "followup"
472
  elif brief.action == "answer":
@@ -475,34 +520,34 @@ Brief: action=route_to_sources, year=2022, month=June, survey_name='Vanderbilt_U
475
  return "execute_stage"
476
  else: # route_to_sources
477
  return "execute_stage" # We'll handle both single and staged in execute_stage
478
-
479
  def _execute_stage(self, state: SurveyAnalysisState) -> Dict[str, Any]:
480
  """Execute one stage of research (handles both single-stage and multi-stage)"""
481
-
482
  brief = state["research_brief"]
483
  current_stage_idx = state.get("current_stage", 0)
484
  previous_stage_results = state.get("stage_results", [])
485
-
486
  # Determine if this is single-stage or multi-stage
487
  if brief.action == "route_to_sources":
488
  # Single-stage: use data_sources directly
489
  if self.verbose:
490
  print(f"\n=== EXECUTING SINGLE-STAGE RESEARCH ===")
491
-
492
  stage_data_sources = brief.data_sources
493
  stage_desc = "Single-stage retrieval"
494
-
495
  elif brief.action == "execute_stages":
496
  # Multi-stage: get current stage
497
  stage = brief.stages[current_stage_idx]
498
-
499
  if self.verbose:
500
  print(f"\n=== EXECUTING STAGE {stage.stage_number}/{len(brief.stages)} ===")
501
  print(f"Description: {stage.description}")
502
-
503
  stage_data_sources = stage.data_sources
504
  stage_desc = stage.description
505
-
506
  # If this stage depends on previous stages, enrich filters with context
507
  if stage.use_previous_results_for and previous_stage_results:
508
  stage_data_sources = self._enrich_data_sources_with_context(
@@ -512,48 +557,52 @@ Brief: action=route_to_sources, year=2022, month=June, survey_name='Vanderbilt_U
512
  )
513
  else:
514
  return {}
515
-
516
  # Execute pipelines for this stage
517
  stage_result = StageResult(
518
  stage_number=current_stage_idx + 1,
519
  status="success"
520
  )
521
-
522
  # TODO: REMOVE WHEN PIPELINES READY - Track what was attempted vs available
523
  attempted_pipelines = []
524
  unavailable_pipelines = []
525
-
526
  # Run each pipeline
527
  for ds in stage_data_sources:
528
  filters_dict = {k: v for k, v in ds.filters.model_dump().items() if v is not None}
529
-
530
  # TODO: REMOVE WHEN PIPELINES READY - START
531
  attempted_pipelines.append(ds.source_type)
532
  # TODO: REMOVE WHEN PIPELINES READY - END
533
-
534
  if ds.source_type == "questionnaire":
535
  if self.verbose:
536
- print(f"\nQuerying questionnaire: {ds.query_description}")
 
537
  if filters_dict:
538
- print(f"Filters: {filters_dict}")
539
-
540
- result = self.questionnaire_rag.query_with_metadata(
541
  question=ds.query_description,
542
  filters=filters_dict if filters_dict else None
543
  )
544
-
545
  # Store with label if provided
546
  if ds.result_label:
547
  result["label"] = ds.result_label
548
-
549
  stage_result.questionnaire_results = result if stage_result.questionnaire_results is None else {
550
  "multiple": True,
551
  "results": [stage_result.questionnaire_results, result]
552
  }
553
-
554
  if self.verbose:
555
- print(f"Retrieved {result['num_sources']} questions")
556
-
 
 
 
557
  # TODO: REMOVE WHEN PIPELINES READY - START
558
  elif ds.source_type not in self.AVAILABLE_PIPELINES:
559
  unavailable_pipelines.append(ds.source_type)
@@ -561,7 +610,103 @@ Brief: action=route_to_sources, year=2022, month=June, survey_name='Vanderbilt_U
561
  print(f"\n⚠️ {ds.source_type.upper()} pipeline not yet available - skipping")
562
  print(f" Requested: {ds.query_description}")
563
  # TODO: REMOVE WHEN PIPELINES READY - END
564
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
565
  # TODO: REMOVE WHEN PIPELINES READY - START
566
  # Add a note about unavailable pipelines to the stage result
567
  if unavailable_pipelines:
@@ -573,10 +718,10 @@ Brief: action=route_to_sources, year=2022, month=June, survey_name='Vanderbilt_U
573
  stage_result.extracted_context = {}
574
  stage_result.extracted_context["unavailable_pipelines"] = unavailable_pipelines
575
  # TODO: REMOVE WHEN PIPELINES READY - END
576
-
577
  # Add stage result to list
578
  updated_stage_results = previous_stage_results + [stage_result]
579
-
580
  # For single-stage, also populate legacy fields
581
  if brief.action == "route_to_sources":
582
  return {
@@ -587,12 +732,12 @@ Brief: action=route_to_sources, year=2022, month=June, survey_name='Vanderbilt_U
587
  "crosstabs_results": stage_result.crosstabs_results,
588
  "sql_results": stage_result.sql_results
589
  }
590
-
591
  return {
592
  "stage_results": updated_stage_results,
593
  "current_stage": current_stage_idx + 1 # FIXED: Increment stage counter
594
  }
595
-
596
  def _enrich_data_sources_with_context(
597
  self,
598
  data_sources: List[DataSource],
@@ -600,10 +745,10 @@ Brief: action=route_to_sources, year=2022, month=June, survey_name='Vanderbilt_U
600
  use_instruction: str
601
  ) -> List[DataSource]:
602
  """Enrich data sources with context from previous stages"""
603
-
604
  if self.verbose:
605
  print(f" Enriching with context: {use_instruction}")
606
-
607
  # For now, handle the most common case: extracting question IDs
608
  if "question" in use_instruction.lower() and "id" in use_instruction.lower():
609
  # Extract question IDs from previous questionnaire results
@@ -613,66 +758,66 @@ Brief: action=route_to_sources, year=2022, month=June, survey_name='Vanderbilt_U
613
  q_results = prev_result.questionnaire_results
614
  if "source_questions" in q_results:
615
  question_ids.extend([q.get("question_id") for q in q_results["source_questions"]])
616
-
617
  if question_ids and self.verbose:
618
  print(f" Found {len(question_ids)} question IDs from previous stages")
619
-
620
  # Add question_ids to filters
621
  enriched_sources = []
622
  for ds in data_sources:
623
  new_filters = ds.filters.model_copy()
624
  new_filters.question_ids = question_ids if question_ids else None
625
-
626
  enriched_ds = ds.model_copy()
627
  enriched_ds.filters = new_filters
628
  enriched_sources.append(enriched_ds)
629
-
630
  return enriched_sources
631
-
632
  return data_sources
633
-
634
  def _extract_stage_context(self, state: SurveyAnalysisState) -> Dict[str, Any]:
635
  """Extract key context from completed stage for use in next stages"""
636
-
637
  stage_results = state.get("stage_results", [])
638
  if not stage_results:
639
  return {}
640
-
641
  current_result = stage_results[-1]
642
-
643
  # Extract question IDs if questionnaire results exist
644
  extracted_context = {}
645
-
646
  if current_result.questionnaire_results:
647
  q_results = current_result.questionnaire_results
648
  if "source_questions" in q_results:
649
  question_ids = [q.get("question_id") for q in q_results["source_questions"]]
650
  extracted_context["question_ids"] = question_ids
651
-
652
  if self.verbose:
653
  print(f"\n=== EXTRACTED CONTEXT FROM STAGE {current_result.stage_number} ===")
654
  print(f"Question IDs: {len(question_ids)} extracted")
655
-
656
  # Update the stage result with extracted context
657
  current_result.extracted_context = extracted_context
658
-
659
  return {}
660
-
661
  def _route_after_stage(self, state: SurveyAnalysisState) -> str:
662
  """Decide if we need to execute another stage or move to verification"""
663
-
664
  brief = state["research_brief"]
665
  current_stage_idx = state.get("current_stage", 0)
666
-
667
  # Single-stage query
668
  if brief.action == "route_to_sources":
669
  if self.verbose:
670
  print("\n=== SINGLE-STAGE COMPLETE → VERIFICATION ===")
671
  return "verify"
672
-
673
  # Multi-stage query
674
  total_stages = len(brief.stages)
675
-
676
  # FIXED: Don't add 1 here since current_stage was already incremented
677
  if current_stage_idx < total_stages:
678
  if self.verbose:
@@ -682,84 +827,190 @@ Brief: action=route_to_sources, year=2022, month=June, survey_name='Vanderbilt_U
682
  if self.verbose:
683
  print(f"\n=== ALL {total_stages} STAGES COMPLETE → VERIFICATION ===")
684
  return "verify"
685
-
686
  def _verify_results(self, state: SurveyAnalysisState) -> Dict[str, Any]:
687
  """Verify that retrieved data answers the question"""
688
-
689
  if self.verbose:
690
  print("\n=== VERIFYING RESULTS ===")
691
-
692
  # Use the latest question only
693
  question = self._get_full_question_context(state)
694
-
695
  stage_results = state.get("stage_results", [])
696
  brief = state["research_brief"]
697
-
698
- # Build summary of what we retrieved
699
  retrieval_summary = []
 
 
700
  total_questions = 0
701
-
 
 
702
  # TODO: REMOVE WHEN PIPELINES READY - START
703
  unavailable_pipelines_found = []
704
  # TODO: REMOVE WHEN PIPELINES READY - END
705
-
706
  for stage_result in stage_results:
 
 
 
707
  if stage_result.questionnaire_results:
708
  q_res = stage_result.questionnaire_results
709
- num = q_res.get("num_sources", 0)
710
- total_questions += num
711
- retrieval_summary.append(f"Stage {stage_result.stage_number}: Retrieved {num} questions")
712
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
713
  # TODO: REMOVE WHEN PIPELINES READY - START
714
  # Check if any pipelines were unavailable
715
  if stage_result.extracted_context and "unavailable_pipelines" in stage_result.extracted_context:
716
  unavailable = stage_result.extracted_context["unavailable_pipelines"]
717
  unavailable_pipelines_found.extend(unavailable)
718
- retrieval_summary.append(f"Stage {stage_result.stage_number}: ⚠️ {', '.join(unavailable)} not yet available")
719
  # TODO: REMOVE WHEN PIPELINES READY - END
720
-
721
  if not retrieval_summary:
722
  retrieval_summary.append("No data was retrieved")
723
-
724
- # Simple heuristic: if this is a single-stage simple query and we got results, auto-pass
725
- if brief.action == "route_to_sources" and len(stage_results) == 1 and total_questions > 0:
726
- # Check if question is a simple "what questions" type query
727
- question_lower = question.lower()
728
- simple_patterns = ["what question", "which question", "list question", "show question", "questions asked"]
729
-
730
- if any(pattern in question_lower for pattern in simple_patterns):
731
- if self.verbose:
732
- print(f"✓ Auto-pass: Simple question retrieval with {total_questions} results")
733
-
734
- return {
735
- "verification": VerificationResult(
736
- answers_question=True,
737
- missing_info=None,
738
- improvement_suggestion=None
739
- )
740
- }
741
-
742
- # TODO: REMOVE WHEN PIPELINES READY - START
743
- # If we have unavailable pipelines but got questionnaire data, auto-pass with note
744
- if unavailable_pipelines_found and total_questions > 0:
745
- if self.verbose:
746
- print(f"✓ Auto-pass: Got questionnaire data, {len(unavailable_pipelines_found)} pipeline(s) not yet available")
747
-
748
- return {
749
- "verification": VerificationResult(
750
- answers_question=True,
751
- missing_info=None,
752
- improvement_suggestion=None
753
- )
754
- }
755
- # TODO: REMOVE WHEN PIPELINES READY - END
756
-
757
- # If we got 0 results, auto-fail without calling LLM
758
- # CRITICAL: Don't retry - set retry_count to max to skip retry loop
759
- if total_questions == 0:
760
  if self.verbose:
761
  print("✗ Auto-fail: No results retrieved (skipping retry - data doesn't exist)")
762
-
763
  return {
764
  "verification": VerificationResult(
765
  answers_question=False,
@@ -768,67 +1019,48 @@ Brief: action=route_to_sources, year=2022, month=June, survey_name='Vanderbilt_U
768
  ),
769
  "retry_count": state.get("max_retries", self.max_retries) # FIXED: Skip retry
770
  }
771
-
772
- # For other cases, use LLM verification
773
- system_prompt = """You are a verification expert. Your ONLY job is to check if the retrieved data matches what the user asked for.
774
-
775
- CRITICAL RULES:
776
- 1. **Match the question literally** - Don't add requirements the user didn't ask for
777
- - If they asked "what questions were asked?" and we retrieved questions → SUCCESS
778
- - If they asked "what are the results?" and we only have questions → FAILURE
779
-
780
- 2. **Don't overthink it** - Keep it simple:
781
- - Did we retrieve the type of data they asked for? (questions, results, etc.)
782
- - Is it from the right time period/survey they specified?
783
- - Is there enough data (at least 1 result)?
784
-
785
- 3. **Only fail if there's an actual problem**:
786
- - We retrieved the wrong type of data (e.g., questions when they asked for results)
787
- - We retrieved from the wrong time period/survey
788
-
789
- 4. **Do NOT fail if**:
790
- - User asked for questions and we got questions (even if we don't have "analysis")
791
- - User asked for data from June 2025 and that's what we got
792
- - The data seems sufficient to answer their actual question
793
-
794
- Be practical, not pedantic. If the retrieved data can answer what they asked, approve it.
795
- """
796
-
797
  verifier = self.llm.with_structured_output(VerificationResult)
798
-
799
- verification = verifier.invoke([
800
- SystemMessage(content=system_prompt),
801
- HumanMessage(content=f"""
802
- User question: "{question}"
803
 
804
- What we retrieved:
805
- {chr(10).join(retrieval_summary)}
 
 
 
 
 
 
 
806
 
807
- Simple question: Can we answer their question with this data? YES or NO.
808
- """)
 
809
  ])
810
-
811
  if self.verbose:
812
  print(f"Answers question: {verification.answers_question}")
813
  if not verification.answers_question:
814
  print(f"Missing: {verification.missing_info}")
815
  print(f"Suggestion: {verification.improvement_suggestion}")
816
-
817
  # Increment retry count if verification fails
818
  updates = {"verification": verification}
819
  if not verification.answers_question:
820
  current_retry = state.get("retry_count", 0)
821
  updates["retry_count"] = current_retry + 1
822
-
823
  return updates
824
-
825
  def _route_after_verification(self, state: SurveyAnalysisState) -> str:
826
  """Route based on verification result"""
827
-
828
  verification = state["verification"]
829
  retry_count = state.get("retry_count", 0)
830
  max_retries = state.get("max_retries", self.max_retries)
831
-
832
  if verification.answers_question:
833
  return "synthesize"
834
  elif retry_count < max_retries:
@@ -839,18 +1071,18 @@ Simple question: Can we answer their question with this data? YES or NO.
839
  if self.verbose:
840
  print(f"\n⚠️ Max retries reached, proceeding with partial results")
841
  return "give_up"
842
-
843
  def _synthesize_response(self, state: SurveyAnalysisState) -> Dict[str, Any]:
844
  """Synthesize final response from all results"""
845
-
846
  if self.verbose:
847
  print("\n=== SYNTHESIZING RESPONSE ===")
848
-
849
  brief = state["research_brief"]
850
-
851
  # Use the latest question
852
  full_question = self._get_full_question_context(state)
853
-
854
  # Handle followup action
855
  if brief.action == "followup":
856
  if self.verbose:
@@ -859,7 +1091,7 @@ Simple question: Can we answer their question with this data? YES or NO.
859
  "final_answer": brief.followup_question,
860
  "messages": [AIMessage(content=brief.followup_question)]
861
  }
862
-
863
  # Handle direct answer (no data retrieval)
864
  if brief.action == "answer":
865
  if self.verbose:
@@ -868,15 +1100,15 @@ Simple question: Can we answer their question with this data? YES or NO.
868
  SystemMessage(content="Answer the user's question directly."),
869
  HumanMessage(content=full_question)
870
  ]).content
871
-
872
  return {
873
  "final_answer": answer,
874
  "messages": [AIMessage(content=answer)]
875
  }
876
-
877
  # Get stage results
878
  stage_results = state.get("stage_results", [])
879
-
880
  if not stage_results:
881
  if self.verbose:
882
  print("No stage results available")
@@ -884,49 +1116,153 @@ Simple question: Can we answer their question with this data? YES or NO.
884
  "final_answer": "I was unable to retrieve any data to answer your question.",
885
  "messages": [AIMessage(content="I was unable to retrieve any data to answer your question.")]
886
  }
887
-
888
- # CASE 1: Single stage with single pipeline → return direct answer
889
- if len(stage_results) == 1:
890
- stage_result = stage_results[0]
891
-
892
- # Check if only one pipeline returned data
893
- pipelines_with_data = 0
894
- direct_answer = None
895
-
896
- if stage_result.questionnaire_results:
897
- pipelines_with_data += 1
898
- direct_answer = stage_result.questionnaire_results.get("answer")
899
-
900
- if pipelines_with_data == 1 and direct_answer:
901
- if self.verbose:
902
- print("Single stage, single pipeline - returning direct answer (no synthesis)")
903
- return {
904
- "final_answer": direct_answer,
905
- "messages": [AIMessage(content=direct_answer)]
906
- }
907
-
908
- # CASE 2: Multiple stages or multiple pipelines → synthesize
909
  if self.verbose:
910
- print(f"Synthesizing from {len(stage_results)} stage(s)")
911
-
912
- # Build context from all stages
913
  context_parts = []
914
-
915
  # TODO: REMOVE WHEN PIPELINES READY - START
916
  unavailable_pipelines_overall = []
917
  # TODO: REMOVE WHEN PIPELINES READY - END
918
-
919
  for i, stage_result in enumerate(stage_results, 1):
 
920
  if stage_result.questionnaire_results:
921
  q_res = stage_result.questionnaire_results
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
922
 
923
- # Check if this is a labeled result
924
- label = q_res.get("label", f"Stage {i}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
925
 
926
- context_parts.append(f"\n=== {label.upper()} ===")
927
- context_parts.append(f"Stage {i} results:")
928
- context_parts.append(q_res.get("answer", "No answer available"))
929
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
930
  # TODO: REMOVE WHEN PIPELINES READY - START
931
  # Track unavailable pipelines for note in synthesis
932
  if stage_result.extracted_context and "unavailable_pipelines" in stage_result.extracted_context:
@@ -934,7 +1270,7 @@ Simple question: Can we answer their question with this data? YES or NO.
934
  unavailable_pipelines_overall.extend(unavailable)
935
  context_parts.append(f"\n⚠️ Note: {', '.join(unavailable)} data was requested but not yet available")
936
  # TODO: REMOVE WHEN PIPELINES READY - END
937
-
938
  # TODO: REMOVE WHEN PIPELINES READY - START
939
  unavailable_note = ""
940
  if unavailable_pipelines_overall:
@@ -947,57 +1283,46 @@ Simple question: Can we answer their question with this data? YES or NO.
947
  Please answer based on the questionnaire data that IS available, and note any limitations.
948
  """
949
  # TODO: REMOVE WHEN PIPELINES READY - END
950
-
951
- synthesis_prompt = f"""Synthesize results from {'multiple stages' if len(stage_results) > 1 else 'the research'} to answer the user's question.
952
 
953
- User question: {full_question}
954
-
955
- Research plan: {brief.reasoning}
956
-
957
- Retrieved data:
958
- {chr(10).join(context_parts)}
959
-
960
- {unavailable_note}
961
 
962
- Instructions:
963
- - If this is a comparative query, clearly organize by the comparison dimensions
964
- - If this is an analytical query (most/least/best/worst), perform the analysis
965
- - Preserve important details from the research
966
- - Use natural language, be clear and organized
967
- - Cite which poll(s) or stage(s) information comes from
968
- - Do NOT make up information not in the retrieved data
969
- - TODO: REMOVE WHEN PIPELINES READY - If some data sources weren't available, clearly state this and explain what you CAN provide
970
- """
971
-
972
  final_answer = self.llm.invoke([
973
- SystemMessage(content="You are a survey data analyst synthesizing research results."),
974
  HumanMessage(content=synthesis_prompt)
975
  ]).content
976
-
977
  if self.verbose:
978
  print("Synthesis complete")
979
-
980
  return {
981
  "final_answer": final_answer,
982
  "messages": [AIMessage(content=final_answer)]
983
  }
984
-
985
  # ========================================================================
986
  # PUBLIC API
987
  # ========================================================================
988
-
989
  def query(self, question: str, thread_id: str = "default") -> str:
990
  """
991
  Query the survey analysis system.
992
-
993
  Args:
994
  question: User's question
995
  thread_id: Conversation thread ID for memory
996
-
997
  Returns:
998
  Answer string
999
  """
1000
-
1001
  # Create initial state for this turn
1002
  initial_state = {
1003
  "messages": [HumanMessage(content=question)],
@@ -1014,22 +1339,22 @@ Instructions:
1014
  "retry_count": 0,
1015
  "max_retries": self.max_retries
1016
  }
1017
-
1018
  config = {
1019
  "configurable": {"thread_id": thread_id},
1020
  "recursion_limit": 50 # FIXED: Increased from default 25
1021
  }
1022
-
1023
  if self.verbose:
1024
  print(f"\n🧵 Thread ID: {thread_id}")
1025
-
1026
  final_state = self.graph.invoke(initial_state, config)
1027
-
1028
  return final_state["final_answer"]
1029
-
1030
  def stream_query(self, question: str, thread_id: str = "default"):
1031
  """Stream the query execution for real-time updates"""
1032
-
1033
  initial_state = {
1034
  "messages": [HumanMessage(content=question)],
1035
  "user_question": question,
@@ -1045,12 +1370,12 @@ Instructions:
1045
  "retry_count": 0,
1046
  "max_retries": self.max_retries
1047
  }
1048
-
1049
  config = {
1050
  "configurable": {"thread_id": thread_id},
1051
  "recursion_limit": 50 # FIXED: Increased from default 25
1052
  }
1053
-
1054
  for event in self.graph.stream(initial_state, config):
1055
  yield event
1056
 
@@ -1062,37 +1387,37 @@ Instructions:
1062
  def main():
1063
  """Interactive CLI"""
1064
  import sys
1065
-
1066
  openai_api_key = os.getenv("OPENAI_API_KEY")
1067
  pinecone_api_key = os.getenv("PINECONE_API_KEY")
1068
-
1069
  if not openai_api_key or not pinecone_api_key:
1070
  print("Error: Missing API keys")
1071
  print("Set OPENAI_API_KEY and PINECONE_API_KEY environment variables")
1072
  sys.exit(1)
1073
-
1074
  print("Initializing survey analysis agent...")
1075
  agent = SurveyAnalysisAgent(
1076
  openai_api_key=openai_api_key,
1077
  pinecone_api_key=pinecone_api_key,
1078
  verbose=True
1079
  )
1080
-
1081
  print("\n" + "="*80)
1082
  print("SURVEY ANALYSIS AGENT (WITH STAGED RESEARCH)")
1083
  print("="*80)
1084
  print("\nType 'quit' to exit\n")
1085
-
1086
  thread_id = "cli_session"
1087
-
1088
  while True:
1089
  try:
1090
  question = input("\nYour question: ").strip()
1091
-
1092
  if not question or question.lower() in ['quit', 'exit', 'q']:
1093
  print("\nGoodbye!")
1094
  break
1095
-
1096
  print("\n" + "-"*80)
1097
  answer = agent.query(question, thread_id=thread_id)
1098
  print("\n" + "="*80)
@@ -1100,7 +1425,7 @@ def main():
1100
  print("="*80)
1101
  print(answer)
1102
  print("="*80)
1103
-
1104
  except KeyboardInterrupt:
1105
  print("\n\nGoodbye!")
1106
  break
@@ -1111,4 +1436,4 @@ def main():
1111
 
1112
 
1113
  if __name__ == "__main__":
1114
- main()
 
13
 
14
  Current Status:
15
  - ✅ Questionnaire pipeline: ACTIVE
16
+ - Toplines pipeline: ACTIVE
17
+ - Crosstabs pipeline: ACTIVE
18
  - ⏳ SQL pipeline: Not yet implemented
19
  """
20
 
 
32
 
33
  # Import the questionnaire RAG
34
  from questionnaire_rag import QuestionnaireRAG
35
+ from toplines_rag import ToplinesRAG
36
+ from crosstab_rag import CrosstabsRAG
37
+
38
+
39
+ # ============================================================================
40
+ # PROMPT LOADING UTILITIES
41
+ # ============================================================================
42
+
43
+ def _load_prompt_file(filename: str) -> str:
44
+ """Load a prompt file from the prompts directory"""
45
+ prompt_dir = Path(__file__).parent / "prompts"
46
+ prompt_path = prompt_dir / filename
47
+ if not prompt_path.exists():
48
+ raise FileNotFoundError(f"Prompt file not found: {prompt_path}")
49
+ return prompt_path.read_text(encoding="utf-8")
50
 
51
  try:
52
  from dotenv import load_dotenv
 
62
  class QueryFilters(BaseModel):
63
  """Filters for data source queries - Pydantic v2 with strict schema"""
64
  model_config = ConfigDict(extra="forbid")
65
+
66
  year: Optional[int] = Field(default=None, description="Year filter (e.g., 2025)")
67
  month: Optional[str] = Field(default=None, description="Month filter (e.g., 'February')")
68
  poll_date: Optional[str] = Field(default=None, description="Specific poll date (e.g., '2025-02-15')")
 
74
  class DataSource(BaseModel):
75
  """Represents a data source to query"""
76
  model_config = ConfigDict(extra="forbid")
77
+
78
  source_type: Literal["questionnaire", "toplines", "crosstabs", "sql"]
79
  query_description: str = Field(description="What to retrieve from this source")
80
  filters: QueryFilters = Field(default_factory=QueryFilters, description="Filters to apply")
 
84
  class ResearchStage(BaseModel):
85
  """A single stage in a multi-stage research plan"""
86
  model_config = ConfigDict(extra="forbid")
87
+
88
  stage_number: int = Field(description="Stage number (1-indexed)")
89
  description: str = Field(description="What this stage accomplishes")
90
  data_sources: List[DataSource] = Field(description="Data sources to query in this stage")
91
  depends_on_stages: List[int] = Field(default_factory=list, description="Which prior stages this depends on")
92
  use_previous_results_for: Optional[str] = Field(
93
+ default=None,
94
  description="How to use previous stage results (e.g., 'Extract question IDs from stage 1')"
95
  )
96
 
 
98
  class ResearchBrief(BaseModel):
99
  """Research brief - can be either single-stage or multi-stage"""
100
  model_config = ConfigDict(extra="forbid")
101
+
102
  action: Literal["answer", "followup", "route_to_sources", "execute_stages"]
103
  followup_question: Optional[str] = Field(default=None, description="Follow-up question to ask user")
104
  reasoning: str = Field(description="Why this approach was chosen")
105
+
106
  # For simple queries (single-stage)
107
  data_sources: List[DataSource] = Field(default_factory=list, description="Data sources for simple queries")
108
+
109
  # For complex queries (multi-stage)
110
  stages: List[ResearchStage] = Field(default_factory=list, description="Ordered stages of research")
111
 
 
113
  class StageResult(BaseModel):
114
  """Results from executing one stage"""
115
  model_config = ConfigDict(extra="forbid")
116
+
117
  stage_number: int
118
  status: Literal["success", "partial", "failed"]
119
  questionnaire_results: Optional[Dict[str, Any]] = None
 
129
  class VerificationResult(BaseModel):
130
  """Result of verifying if data answers the question"""
131
  model_config = ConfigDict(extra="forbid")
132
+
133
  answers_question: bool = Field(description="Whether the data fully answers the question")
134
  missing_info: Optional[str] = Field(default=None, description="What information is missing")
135
  improvement_suggestion: Optional[str] = Field(default=None, description="How to improve the research brief")
 
140
  # User interaction
141
  messages: Annotated[List, operator.add]
142
  user_question: str
143
+
144
  # Planning
145
  research_brief: Optional[ResearchBrief]
146
+
147
  # Stage execution
148
  current_stage: int # Which stage we're executing (0-indexed internally, but 1-indexed in models)
149
  stage_results: List[StageResult] # Results from each completed stage
150
+
151
  # Legacy single-stage results (for backward compatibility)
152
  questionnaire_results: Optional[Dict[str, Any]]
153
  toplines_results: Optional[Dict[str, Any]]
154
  crosstabs_results: Optional[Dict[str, Any]]
155
  sql_results: Optional[Dict[str, Any]]
156
+
157
  # Verification & synthesis
158
  verification: Optional[VerificationResult]
159
  final_answer: Optional[str]
160
+
161
  # Control flow
162
  retry_count: int
163
  max_retries: int
 
170
  class SurveyAnalysisAgent:
171
  """
172
  Multi-agent system for analyzing survey data with staged research briefs.
173
+
174
  Flow:
175
  1. User asks question
176
  2. Research brief agent decides: simple (one-shot) or complex (staged)
 
178
  4. For complex: execute stages sequentially, each using previous results
179
  5. Final synthesis combines all stage results
180
  """
181
+
182
  # TODO: REMOVE WHEN PIPELINES READY - START
183
  # Track which pipelines are currently available
184
+ AVAILABLE_PIPELINES = {"questionnaire", "toplines", "crosstabs"} # Add "sql" as it becomes ready
185
  # TODO: REMOVE WHEN PIPELINES READY - END
186
+
187
  def __init__(
188
  self,
189
  openai_api_key: str,
 
196
  self.pinecone_api_key = pinecone_api_key
197
  self.verbose = verbose
198
  self.max_retries = max_retries
199
+
200
  # Initialize LLM
201
  self.llm = ChatOpenAI(
202
  model=os.getenv("OPENAI_MODEL", "gpt-4o"),
203
  temperature=0
204
  )
205
+
206
  # Initialize questionnaire RAG
207
  if self.verbose:
208
  print("Initializing questionnaire RAG system...")
 
212
  persist_directory=questionnaire_persist_dir,
213
  verbose=verbose
214
  )
215
+
216
+ # Initialize toplines RAG
217
+ if self.verbose:
218
+ print("Initializing toplines RAG system...")
219
+ self.toplines_rag = ToplinesRAG()
220
+
221
+ # Initialize crosstabs RAG (pass questionnaire_rag to reuse question matching)
222
+ if self.verbose:
223
+ print("Initializing crosstabs RAG system...")
224
+ self.crosstab_rag = CrosstabsRAG(questionnaire_rag=self.questionnaire_rag, verbose=self.verbose)
225
+
226
  # Build the graph
227
  self.graph = self._build_graph()
228
+
229
  if self.verbose:
230
  print("✓ Survey analysis agent initialized with staged research capability")
231
+
232
  def _build_graph(self) -> StateGraph:
233
  """Build the LangGraph workflow with staged research support"""
234
+
235
  workflow = StateGraph(SurveyAnalysisState)
236
+
237
  # Add nodes
238
  workflow.add_node("generate_research_brief", self._generate_research_brief)
239
  workflow.add_node("execute_stage", self._execute_stage)
240
  workflow.add_node("extract_stage_context", self._extract_stage_context)
241
  workflow.add_node("verify_results", self._verify_results)
242
  workflow.add_node("synthesize_response", self._synthesize_response)
243
+
244
  # Define edges
245
  workflow.add_edge(START, "generate_research_brief")
246
+
247
  # After research brief, route based on action
248
  workflow.add_conditional_edges(
249
  "generate_research_brief",
 
254
  "execute_stage": "execute_stage"
255
  }
256
  )
257
+
258
  # After stage execution, extract context for next stage
259
  workflow.add_edge("execute_stage", "extract_stage_context")
260
+
261
  # After context extraction, decide next step
262
  workflow.add_conditional_edges(
263
  "extract_stage_context",
 
267
  "verify": "verify_results" # All stages done, verify
268
  }
269
  )
270
+
271
  # After verification, decide next step
272
  workflow.add_conditional_edges(
273
  "verify_results",
 
278
  "give_up": "synthesize_response"
279
  }
280
  )
281
+
282
  # End after synthesis
283
  workflow.add_edge("synthesize_response", END)
284
+
285
  # Compile with memory
286
  memory = MemorySaver()
287
  return workflow.compile(checkpointer=memory)
288
+
289
  def _get_available_surveys_description(self) -> str:
290
  """Get formatted description of available surveys for LLM prompt"""
291
  survey_names = self.questionnaire_rag.get_available_survey_names()
292
+
293
  if not survey_names:
294
  return "No surveys currently loaded."
295
+
296
  lines = ["Available survey names in the system:"]
297
  for name in survey_names:
298
  lines.append(f" - '{name}'")
299
+
300
+ return "\n".join(lines)
301
+
302
+ def _get_available_months_for_year(self, year: int, survey_name: str = "Vanderbilt_Unity_Poll") -> List[str]:
303
+ """Get list of available months for a given year and survey, sorted chronologically"""
304
+ month_order = ["January", "February", "March", "April", "May", "June",
305
+ "July", "August", "September", "October", "November", "December"]
306
+ months = []
307
+ catalog = self.questionnaire_rag.poll_catalog
308
+ for poll_date, info in catalog.items():
309
+ if info.get("year") == year and info.get("survey_name") == survey_name:
310
+ month = info.get("month")
311
+ if month:
312
+ months.append(month)
313
+ # Sort chronologically
314
+ return sorted(months, key=lambda m: month_order.index(m) if m in month_order else 999)
315
+
316
+ def _get_available_months_description(self) -> str:
317
+ """Get formatted description of available months by year for LLM prompt"""
318
+ month_order = ["January", "February", "March", "April", "May", "June",
319
+ "July", "August", "September", "October", "November", "December"]
320
+ catalog = self.questionnaire_rag.poll_catalog
321
+ years = {}
322
+ for poll_date, info in catalog.items():
323
+ year = info.get("year")
324
+ month = info.get("month")
325
+ survey = info.get("survey_name")
326
+ if year and month and survey == "Vanderbilt_Unity_Poll":
327
+ if year not in years:
328
+ years[year] = []
329
+ if month not in years[year]:
330
+ years[year].append(month)
331
+
332
+ lines = ["Available polls by year (Vanderbilt Unity Poll):"]
333
+ for year in sorted(years.keys()):
334
+ # Sort months chronologically
335
+ months_sorted = sorted(years[year], key=lambda m: month_order.index(m) if m in month_order else 999)
336
+ months_str = ", ".join(months_sorted)
337
+ lines.append(f" {year}: {months_str}")
338
 
339
  return "\n".join(lines)
340
+
341
  # TODO: REMOVE WHEN PIPELINES READY - START
342
  def _get_pipeline_status_description(self) -> str:
343
  """Get description of available vs unavailable pipelines"""
 
347
  "crosstabs": "Pre-computed cross-tabulations by demographics",
348
  "sql": "Raw survey responses for custom analysis"
349
  }
350
+
351
  lines = []
352
  for pipeline, description in all_pipelines.items():
353
  status = "✅ AVAILABLE" if pipeline in self.AVAILABLE_PIPELINES else "❌ NOT YET AVAILABLE"
354
  lines.append(f"{pipeline.capitalize()}: {description} {status}")
355
+
356
  return "\n".join(lines)
357
  # TODO: REMOVE WHEN PIPELINES READY - END
358
+
359
  def _get_full_question_context(self, state: SurveyAnalysisState) -> str:
360
  """
361
  Build full question context from conversation history.
362
+
363
  IMPORTANT: Only look at the LATEST user message for the current query.
364
  Previous messages provide context but the latest message is what we're answering.
365
  """
366
  messages = state.get("messages", [])
367
+
368
  # Extract all human messages
369
  human_messages = []
370
  for msg in messages:
371
  if isinstance(msg, HumanMessage):
372
  human_messages.append(msg.content)
373
+
374
  if not human_messages:
375
  return state["user_question"]
376
+
377
  # For planning, just use the latest message
378
  # Don't combine with previous messages as that causes misinterpretation
379
  latest_message = human_messages[-1]
380
+
381
  if self.verbose:
382
  print(f"📝 Conversation history: {len(human_messages)} user message(s)")
383
  for i, msg in enumerate(human_messages, 1):
384
  print(f" {i}. {msg[:100]}..." if len(msg) > 100 else f" {i}. {msg}")
385
  print(f"🎯 Answering latest: {latest_message}")
386
+
387
  return latest_message
388
 
389
+
390
  # ========================================================================
391
  # NODE FUNCTIONS
392
  # ========================================================================
393
+
394
  def _generate_research_brief(self, state: SurveyAnalysisState) -> Dict[str, Any]:
395
  """Generate research brief - decides single-stage vs multi-stage approach"""
396
+
397
  if self.verbose:
398
  print("\n=== GENERATING RESEARCH BRIEF ===")
399
+
400
+ # Get conversation history for context
401
+ messages = state.get("messages", [])
402
+ human_messages = [msg.content for msg in messages if isinstance(msg, HumanMessage)]
403
 
404
+ # Get the current question (latest message)
405
  question = self._get_full_question_context(state)
406
 
407
+ # Build conversation context for the prompt
408
+ conversation_context = ""
409
+ if len(human_messages) > 1:
410
+ conversation_context = "\n\nCONVERSATION HISTORY (for context):\n"
411
+ previous_questions = []
412
+ for i, msg in enumerate(human_messages[:-1], 1): # All except the latest
413
+ conversation_context += f" {i}. {msg}\n"
414
+ previous_questions.append(msg)
415
+
416
+ conversation_context += f"\nCurrent question: {question}\n"
417
+
418
+ # Check if current question is a short answer (like "June", "February", "2024")
419
+ # This suggests it's answering a previous followup question
420
+ is_short_answer = len(question.split()) <= 2 and any(word.lower() in ['june', 'february', 'march', 'april', 'may', 'july', 'august', 'september', 'october', 'november', 'december', 'january'] or word.isdigit() for word in question.split())
421
+
422
+ if is_short_answer and previous_questions:
423
+ # Reconstruct the original intent from the previous question
424
+ original_question = previous_questions[-1]
425
+ conversation_context += f"\n🚨 IMPORTANT: The current question '{question}' is a SHORT ANSWER to a previous followup.\n"
426
+ conversation_context += f"Original question was: '{original_question}'\n"
427
+ conversation_context += f"You MUST combine '{question}' with the original intent from '{original_question}'.\n"
428
+ conversation_context += "- If original question asked about 'approval', 'ratings', 'responses', 'percentages' → use TOPLINES pipeline\n"
429
+ conversation_context += "- If original question asked about 'questions', 'what was asked' → use QUESTIONNAIRE pipeline\n"
430
+ conversation_context += "- If original question asked about 'vary by', 'breakdown by', 'by gender/age/etc' → use CROSSTABS pipeline\n"
431
+ conversation_context += "- Extract year/month from the short answer and combine with original question's intent\n"
432
+ conversation_context += f"- Example: Original='Trump's approval in 2025?', Current='June' → Query: 'Trump's approval in June 2025' using TOPLINES\n"
433
+
434
+ conversation_context += "\n⚠️ CRITICAL: Use conversation history to infer intent:\n"
435
+ conversation_context += "- If user says 'these questions', 'for each of these', 'all of them', etc., they're referencing previous results\n"
436
+ conversation_context += "- Infer time periods (year, month) from previous USER QUESTIONS if current question doesn't specify\n"
437
+ conversation_context += "- Example: If previous question was 'what questions about economy in 2025?', infer that 2025 economy questions were retrieved\n"
438
+ conversation_context += "- For 2025, available months are: February, June (from the available polls description)\n"
439
+ conversation_context += "- When user asks 'how do responses vary by X for each of these questions', create stages for EACH available month from the previous question's time period\n"
440
+ conversation_context += "- If previous question mentioned a year (e.g., 'in 2025'), use ALL available months for that year in the current analysis\n"
441
+ conversation_context += "- DO NOT ask followup for month/year if you can infer it from the previous user question\n"
442
+
443
  retry_count = state.get("retry_count", 0)
444
+
445
  # Add context from verification if this is a retry
446
  verification_context = ""
447
  if state.get("verification") and retry_count > 0:
 
452
 
453
  Please improve the research plan based on this feedback.
454
  """
455
+
456
+ # Load research brief prompt from file
457
+ system_prompt_template = _load_prompt_file("research_brief_prompt.txt")
458
+ system_prompt = system_prompt_template.format(
459
+ available_pipelines=self._get_pipeline_status_description(),
460
+ available_surveys=self._get_available_surveys_description(),
461
+ available_months=self._get_available_months_description(),
462
+ verification_context=verification_context
463
+ )
464
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
465
  brief_generator = self.llm.with_structured_output(ResearchBrief)
466
+
467
+ user_prompt = f"User question: {question}\n\nGenerate a research brief."
468
+ if conversation_context:
469
+ user_prompt = conversation_context + "\n\n" + user_prompt
470
+
471
  brief = brief_generator.invoke([
472
  SystemMessage(content=system_prompt),
473
+ HumanMessage(content=user_prompt)
474
  ])
475
+
476
  if self.verbose:
477
  print(f"Action: {brief.action}")
478
  print(f"Reasoning: {brief.reasoning}")
479
+
480
  if brief.followup_question:
481
  print(f"Follow-up: {brief.followup_question}")
482
+
483
  if brief.action == "route_to_sources" and brief.data_sources:
484
  print(f"Simple query - {len(brief.data_sources)} data sources")
485
  for ds in brief.data_sources:
 
487
  print(f" - {ds.source_type}: {ds.query_description}")
488
  if filters_dict:
489
  print(f" Filters: {filters_dict}")
490
+
491
  if brief.action == "execute_stages" and brief.stages:
492
  print(f"Staged query - {len(brief.stages)} stages")
493
  for stage in brief.stages:
 
500
  print(f" - {ds.source_type}: {ds.query_description}")
501
  if ds.result_label:
502
  print(f" Label: {ds.result_label}")
503
+
504
  return {
505
  "research_brief": brief,
506
  "current_stage": 0, # Start at stage 0 (will execute stage 1 first)
507
  "stage_results": [],
508
  "messages": [AIMessage(content=f"[Research plan: {brief.action}]")]
509
  }
510
+
511
  def _route_after_brief(self, state: SurveyAnalysisState) -> str:
512
  """Route based on research brief action"""
513
  brief = state["research_brief"]
514
+
515
  if brief.action == "followup":
516
  return "followup"
517
  elif brief.action == "answer":
 
520
  return "execute_stage"
521
  else: # route_to_sources
522
  return "execute_stage" # We'll handle both single and staged in execute_stage
523
+
524
  def _execute_stage(self, state: SurveyAnalysisState) -> Dict[str, Any]:
525
  """Execute one stage of research (handles both single-stage and multi-stage)"""
526
+
527
  brief = state["research_brief"]
528
  current_stage_idx = state.get("current_stage", 0)
529
  previous_stage_results = state.get("stage_results", [])
530
+
531
  # Determine if this is single-stage or multi-stage
532
  if brief.action == "route_to_sources":
533
  # Single-stage: use data_sources directly
534
  if self.verbose:
535
  print(f"\n=== EXECUTING SINGLE-STAGE RESEARCH ===")
536
+
537
  stage_data_sources = brief.data_sources
538
  stage_desc = "Single-stage retrieval"
539
+
540
  elif brief.action == "execute_stages":
541
  # Multi-stage: get current stage
542
  stage = brief.stages[current_stage_idx]
543
+
544
  if self.verbose:
545
  print(f"\n=== EXECUTING STAGE {stage.stage_number}/{len(brief.stages)} ===")
546
  print(f"Description: {stage.description}")
547
+
548
  stage_data_sources = stage.data_sources
549
  stage_desc = stage.description
550
+
551
  # If this stage depends on previous stages, enrich filters with context
552
  if stage.use_previous_results_for and previous_stage_results:
553
  stage_data_sources = self._enrich_data_sources_with_context(
 
557
  )
558
  else:
559
  return {}
560
+
561
  # Execute pipelines for this stage
562
  stage_result = StageResult(
563
  stage_number=current_stage_idx + 1,
564
  status="success"
565
  )
566
+
567
  # TODO: REMOVE WHEN PIPELINES READY - Track what was attempted vs available
568
  attempted_pipelines = []
569
  unavailable_pipelines = []
570
+
571
  # Run each pipeline
572
  for ds in stage_data_sources:
573
  filters_dict = {k: v for k, v in ds.filters.model_dump().items() if v is not None}
574
+
575
  # TODO: REMOVE WHEN PIPELINES READY - START
576
  attempted_pipelines.append(ds.source_type)
577
  # TODO: REMOVE WHEN PIPELINES READY - END
578
+
579
  if ds.source_type == "questionnaire":
580
  if self.verbose:
581
+ print(f"\n📊 [Questionnaire Pipeline] Retrieving raw data from QuestionnaireRAG vectorstore")
582
+ print(f" Query: {ds.query_description}")
583
  if filters_dict:
584
+ print(f" Filters: {filters_dict}")
585
+
586
+ result = self.questionnaire_rag.retrieve_raw_data(
587
  question=ds.query_description,
588
  filters=filters_dict if filters_dict else None
589
  )
590
+
591
  # Store with label if provided
592
  if ds.result_label:
593
  result["label"] = ds.result_label
594
+
595
  stage_result.questionnaire_results = result if stage_result.questionnaire_results is None else {
596
  "multiple": True,
597
  "results": [stage_result.questionnaire_results, result]
598
  }
599
+
600
  if self.verbose:
601
+ print(f"✅ [Questionnaire Pipeline] Retrieved {result['num_sources']} question(s) from QuestionnaireRAG vectorstore")
602
+ if result.get("source_questions"):
603
+ question_vars = [q.get("variable_name", "unknown") for q in result["source_questions"][:3]]
604
+ print(f" Variables: {', '.join(question_vars)}{' ...' if len(result['source_questions']) > 3 else ''}")
605
+
606
  # TODO: REMOVE WHEN PIPELINES READY - START
607
  elif ds.source_type not in self.AVAILABLE_PIPELINES:
608
  unavailable_pipelines.append(ds.source_type)
 
610
  print(f"\n⚠️ {ds.source_type.upper()} pipeline not yet available - skipping")
611
  print(f" Requested: {ds.query_description}")
612
  # TODO: REMOVE WHEN PIPELINES READY - END
613
+
614
+ # Toplines pipeline implementation (now available)
615
+ elif ds.source_type == "toplines":
616
+ if self.verbose:
617
+ print(f"\n📊 [Toplines Pipeline] Retrieving raw data from ToplinesRAG vectorstore")
618
+ print(f" Query: {ds.query_description}")
619
+ if filters_dict:
620
+ print(f" Filters: {filters_dict}")
621
+
622
+ try:
623
+ # Retrieve raw data without LLM synthesis
624
+ toplines_result = self.toplines_rag.retrieve_raw_data(
625
+ query=ds.query_description,
626
+ filters=filters_dict if filters_dict else None,
627
+ top_k=10
628
+ )
629
+
630
+ if ds.result_label:
631
+ toplines_result["label"] = ds.result_label
632
+
633
+ stage_result.toplines_results = toplines_result if stage_result.toplines_results is None else {
634
+ "multiple": True,
635
+ "results": [stage_result.toplines_results, toplines_result]
636
+ }
637
+
638
+ if self.verbose:
639
+ print(f"✅ [Toplines Pipeline] Retrieved {toplines_result['num_sources']} topline document(s) from ToplinesRAG vectorstore")
640
+ except Exception as e:
641
+ if self.verbose:
642
+ print(f"⚠️ Error querying toplines: {e}")
643
+ stage_result.status = "partial"
644
+ if not stage_result.extracted_context:
645
+ stage_result.extracted_context = {}
646
+ stage_result.extracted_context["toplines_error"] = str(e)
647
+
648
+ # Crosstabs pipeline implementation (now available)
649
+ elif ds.source_type == "crosstabs":
650
+ if self.verbose:
651
+ print(f"\n📊 [Crosstabs Pipeline] Retrieving raw data from CrosstabsRAG")
652
+ print(f" Query description: {ds.query_description}")
653
+ if filters_dict:
654
+ print(f" Filters: {filters_dict}")
655
+
656
+ try:
657
+ # Build query string from query_description and filters
658
+ # CrosstabsRAG.retrieve_raw_data() expects a full query string that includes year/month/poll info
659
+ query_text = ds.query_description
660
+
661
+ # Enhance query with filter information if available
662
+ if filters_dict:
663
+ filter_parts = []
664
+ if "year" in filters_dict:
665
+ filter_parts.append(str(filters_dict["year"]))
666
+ if "month" in filters_dict:
667
+ filter_parts.append(filters_dict["month"])
668
+ if "survey_name" in filters_dict:
669
+ filter_parts.append(filters_dict["survey_name"])
670
+ if filter_parts:
671
+ query_text = f"{query_text} {' '.join(filter_parts)}"
672
+
673
+ # Retrieve raw data without LLM summarization
674
+ crosstab_result = self.crosstab_rag.retrieve_raw_data(user_query=query_text, filters=filters_dict)
675
+
676
+ # Handle error response
677
+ if "error" in crosstab_result:
678
+ if self.verbose:
679
+ print(f"⚠️ [Crosstabs Pipeline] Query error: {crosstab_result['error']}")
680
+ stage_result.status = "partial"
681
+ if not stage_result.extracted_context:
682
+ stage_result.extracted_context = {}
683
+ stage_result.extracted_context["crosstabs_error"] = crosstab_result["error"]
684
+ else:
685
+ # Success - store raw data with label if provided
686
+ if ds.result_label:
687
+ crosstab_result["label"] = ds.result_label
688
+
689
+ stage_result.crosstabs_results = crosstab_result if stage_result.crosstabs_results is None else {
690
+ "multiple": True,
691
+ "results": [stage_result.crosstabs_results, crosstab_result]
692
+ }
693
+
694
+ if self.verbose:
695
+ matched_vars = crosstab_result.get("matched_variables", [])
696
+ namespace = crosstab_result.get("namespace_used", "unknown")
697
+ num_questions = len(matched_vars)
698
+ print(f"✅ [Crosstabs Pipeline] Complete")
699
+ print(f" Matched variables: {', '.join(matched_vars[:3])}{' ...' if len(matched_vars) > 3 else ''}")
700
+ print(f" Namespace: {namespace}")
701
+ print(f" Questions with crosstab data: {num_questions}")
702
+ except Exception as e:
703
+ if self.verbose:
704
+ print(f"⚠️ Error querying crosstabs: {e}")
705
+ stage_result.status = "partial"
706
+ if not stage_result.extracted_context:
707
+ stage_result.extracted_context = {}
708
+ stage_result.extracted_context["crosstabs_error"] = str(e)
709
+
710
  # TODO: REMOVE WHEN PIPELINES READY - START
711
  # Add a note about unavailable pipelines to the stage result
712
  if unavailable_pipelines:
 
718
  stage_result.extracted_context = {}
719
  stage_result.extracted_context["unavailable_pipelines"] = unavailable_pipelines
720
  # TODO: REMOVE WHEN PIPELINES READY - END
721
+
722
  # Add stage result to list
723
  updated_stage_results = previous_stage_results + [stage_result]
724
+
725
  # For single-stage, also populate legacy fields
726
  if brief.action == "route_to_sources":
727
  return {
 
732
  "crosstabs_results": stage_result.crosstabs_results,
733
  "sql_results": stage_result.sql_results
734
  }
735
+
736
  return {
737
  "stage_results": updated_stage_results,
738
  "current_stage": current_stage_idx + 1 # FIXED: Increment stage counter
739
  }
740
+
741
  def _enrich_data_sources_with_context(
742
  self,
743
  data_sources: List[DataSource],
 
745
  use_instruction: str
746
  ) -> List[DataSource]:
747
  """Enrich data sources with context from previous stages"""
748
+
749
  if self.verbose:
750
  print(f" Enriching with context: {use_instruction}")
751
+
752
  # For now, handle the most common case: extracting question IDs
753
  if "question" in use_instruction.lower() and "id" in use_instruction.lower():
754
  # Extract question IDs from previous questionnaire results
 
758
  q_results = prev_result.questionnaire_results
759
  if "source_questions" in q_results:
760
  question_ids.extend([q.get("question_id") for q in q_results["source_questions"]])
761
+
762
  if question_ids and self.verbose:
763
  print(f" Found {len(question_ids)} question IDs from previous stages")
764
+
765
  # Add question_ids to filters
766
  enriched_sources = []
767
  for ds in data_sources:
768
  new_filters = ds.filters.model_copy()
769
  new_filters.question_ids = question_ids if question_ids else None
770
+
771
  enriched_ds = ds.model_copy()
772
  enriched_ds.filters = new_filters
773
  enriched_sources.append(enriched_ds)
774
+
775
  return enriched_sources
776
+
777
  return data_sources
778
+
779
  def _extract_stage_context(self, state: SurveyAnalysisState) -> Dict[str, Any]:
780
  """Extract key context from completed stage for use in next stages"""
781
+
782
  stage_results = state.get("stage_results", [])
783
  if not stage_results:
784
  return {}
785
+
786
  current_result = stage_results[-1]
787
+
788
  # Extract question IDs if questionnaire results exist
789
  extracted_context = {}
790
+
791
  if current_result.questionnaire_results:
792
  q_results = current_result.questionnaire_results
793
  if "source_questions" in q_results:
794
  question_ids = [q.get("question_id") for q in q_results["source_questions"]]
795
  extracted_context["question_ids"] = question_ids
796
+
797
  if self.verbose:
798
  print(f"\n=== EXTRACTED CONTEXT FROM STAGE {current_result.stage_number} ===")
799
  print(f"Question IDs: {len(question_ids)} extracted")
800
+
801
  # Update the stage result with extracted context
802
  current_result.extracted_context = extracted_context
803
+
804
  return {}
805
+
806
  def _route_after_stage(self, state: SurveyAnalysisState) -> str:
807
  """Decide if we need to execute another stage or move to verification"""
808
+
809
  brief = state["research_brief"]
810
  current_stage_idx = state.get("current_stage", 0)
811
+
812
  # Single-stage query
813
  if brief.action == "route_to_sources":
814
  if self.verbose:
815
  print("\n=== SINGLE-STAGE COMPLETE → VERIFICATION ===")
816
  return "verify"
817
+
818
  # Multi-stage query
819
  total_stages = len(brief.stages)
820
+
821
  # FIXED: Don't add 1 here since current_stage was already incremented
822
  if current_stage_idx < total_stages:
823
  if self.verbose:
 
827
  if self.verbose:
828
  print(f"\n=== ALL {total_stages} STAGES COMPLETE → VERIFICATION ===")
829
  return "verify"
830
+
831
  def _verify_results(self, state: SurveyAnalysisState) -> Dict[str, Any]:
832
  """Verify that retrieved data answers the question"""
833
+
834
  if self.verbose:
835
  print("\n=== VERIFYING RESULTS ===")
836
+
837
  # Use the latest question only
838
  question = self._get_full_question_context(state)
839
+
840
  stage_results = state.get("stage_results", [])
841
  brief = state["research_brief"]
842
+
843
+ # Build detailed summary of raw data retrieved
844
  retrieval_summary = []
845
+ raw_data_details = []
846
+ filters_applied_list = []
847
  total_questions = 0
848
+ total_toplines_docs = 0
849
+ total_crosstab_variables = 0
850
+
851
  # TODO: REMOVE WHEN PIPELINES READY - START
852
  unavailable_pipelines_found = []
853
  # TODO: REMOVE WHEN PIPELINES READY - END
854
+
855
  for stage_result in stage_results:
856
+ stage_num = stage_result.stage_number
857
+
858
+ # Analyze questionnaire raw data
859
  if stage_result.questionnaire_results:
860
  q_res = stage_result.questionnaire_results
861
+ # Handle multiple results
862
+ if isinstance(q_res, dict) and q_res.get("multiple"):
863
+ all_q_results = q_res.get("results", [])
864
+ else:
865
+ all_q_results = [q_res]
866
+
867
+ for q_result in all_q_results:
868
+ num = q_result.get("num_sources", 0)
869
+ total_questions += num
870
+ source_questions = q_result.get("source_questions", [])
871
+ filters_applied = q_result.get("filters_applied", {})
872
+
873
+ # Collect filters applied
874
+ if filters_applied:
875
+ filters_applied_list.append(f"Stage {stage_num} Questionnaire: {filters_applied}")
876
+
877
+ if num > 0:
878
+ # Extract key details about retrieved questions
879
+ question_topics = set()
880
+ question_vars = []
881
+ question_years = set()
882
+ question_months = set()
883
+ for q in source_questions[:5]: # Sample first 5
884
+ topics = q.get("topics", [])
885
+ if isinstance(topics, str):
886
+ topics = [t.strip() for t in topics.split(",")]
887
+ question_topics.update(topics)
888
+ question_vars.append(q.get("variable_name", "unknown"))
889
+ if q.get("year"):
890
+ question_years.add(str(q.get("year")))
891
+ if q.get("month"):
892
+ question_months.add(q.get("month"))
893
+
894
+ topics_str = ", ".join(sorted(question_topics)[:5])
895
+ vars_str = ", ".join(question_vars[:3])
896
+ years_str = ", ".join(sorted(question_years)) if question_years else "unknown"
897
+ months_str = ", ".join(sorted(question_months)) if question_months else "unknown"
898
+
899
+ time_period_info = f"{months_str} {years_str}" if months_str != "unknown" else years_str
900
+
901
+ retrieval_summary.append(
902
+ f"Stage {stage_num}: {num} questionnaire question(s) "
903
+ f"(time period: {time_period_info}, variables: {vars_str}{'...' if num > 3 else ''}, topics: {topics_str[:50]})"
904
+ )
905
+
906
+ raw_data_details.append(
907
+ f"Stage {stage_num} Questionnaire Data:\n"
908
+ f"- Number of questions: {num}\n"
909
+ f"- Time period: {time_period_info}\n"
910
+ f"- Sample variables: {vars_str}\n"
911
+ f"- Topics covered: {topics_str[:100]}\n"
912
+ )
913
+ else:
914
+ retrieval_summary.append(f"Stage {stage_num}: No questionnaire questions retrieved")
915
+
916
+ # Analyze toplines raw data
917
+ if stage_result.toplines_results:
918
+ t_res = stage_result.toplines_results
919
+ # Handle multiple results
920
+ if isinstance(t_res, dict) and t_res.get("multiple"):
921
+ all_t_results = t_res.get("results", [])
922
+ else:
923
+ all_t_results = [t_res]
924
+
925
+ for t_result in all_t_results:
926
+ num_docs = t_result.get("num_sources", 0)
927
+ total_toplines_docs += num_docs
928
+ retrieved_docs = t_result.get("retrieved_docs", [])
929
+
930
+ if num_docs > 0:
931
+ # Extract key details from toplines documents
932
+ sample_vars = []
933
+ sample_responses = []
934
+ for doc in retrieved_docs[:3]:
935
+ var = doc.metadata.get("variable_name", "unknown")
936
+ response = doc.metadata.get("response_label", "")
937
+ sample_vars.append(var)
938
+ if response:
939
+ sample_responses.append(response[:30])
940
+
941
+ vars_str = ", ".join(sample_vars)
942
+ retrieval_summary.append(
943
+ f"Stage {stage_num}: {num_docs} topline document(s) "
944
+ f"(variables: {vars_str}{'...' if num_docs > 3 else ''})"
945
+ )
946
+
947
+ raw_data_details.append(
948
+ f"Stage {stage_num} Toplines Data:\n"
949
+ f"- Number of documents: {num_docs}\n"
950
+ f"- Sample variables: {vars_str}\n"
951
+ f"- Sample responses: {', '.join(sample_responses[:3])}\n"
952
+ )
953
+ else:
954
+ retrieval_summary.append(f"Stage {stage_num}: No topline documents retrieved")
955
+
956
+ # Analyze crosstabs raw data
957
+ if stage_result.crosstabs_results:
958
+ c_res = stage_result.crosstabs_results
959
+ # Handle multiple results
960
+ if isinstance(c_res, dict) and c_res.get("multiple"):
961
+ all_c_results = c_res.get("results", [])
962
+ else:
963
+ all_c_results = [c_res]
964
+
965
+ for c_result in all_c_results:
966
+ if "error" in c_result:
967
+ retrieval_summary.append(f"Stage {stage_num}: Crosstabs error - {c_result['error']}")
968
+ continue
969
+
970
+ crosstab_docs_by_var = c_result.get("crosstab_docs_by_variable", {})
971
+ matched_vars = c_result.get("matched_variables", [])
972
+ total_crosstab_variables += len(matched_vars)
973
+
974
+ if matched_vars:
975
+ vars_str = ", ".join(matched_vars[:3])
976
+ namespace = c_result.get("namespace_used", "unknown")
977
+ retrieval_summary.append(
978
+ f"Stage {stage_num}: {len(matched_vars)} crosstab variable(s) "
979
+ f"(variables: {vars_str}{'...' if len(matched_vars) > 3 else ''}, namespace: {namespace})"
980
+ )
981
+
982
+ # Count total crosstab documents
983
+ total_crosstab_docs = sum(
984
+ len(var_data.get("crosstab_docs", []))
985
+ for var_data in crosstab_docs_by_var.values()
986
+ )
987
+
988
+ raw_data_details.append(
989
+ f"Stage {stage_num} Crosstabs Data:\n"
990
+ f"- Number of variables: {len(matched_vars)}\n"
991
+ f"- Variables: {vars_str}\n"
992
+ f"- Total crosstab documents: {total_crosstab_docs}\n"
993
+ f"- Namespace: {namespace}\n"
994
+ )
995
+ else:
996
+ retrieval_summary.append(f"Stage {stage_num}: No crosstab data retrieved")
997
+
998
  # TODO: REMOVE WHEN PIPELINES READY - START
999
  # Check if any pipelines were unavailable
1000
  if stage_result.extracted_context and "unavailable_pipelines" in stage_result.extracted_context:
1001
  unavailable = stage_result.extracted_context["unavailable_pipelines"]
1002
  unavailable_pipelines_found.extend(unavailable)
1003
+ retrieval_summary.append(f"Stage {stage_num}: ⚠️ {', '.join(unavailable)} not yet available")
1004
  # TODO: REMOVE WHEN PIPELINES READY - END
1005
+
1006
  if not retrieval_summary:
1007
  retrieval_summary.append("No data was retrieved")
1008
+
1009
+ # Auto-fail if we got 0 results across all data types
1010
+ if total_questions == 0 and total_toplines_docs == 0 and total_crosstab_variables == 0:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1011
  if self.verbose:
1012
  print("✗ Auto-fail: No results retrieved (skipping retry - data doesn't exist)")
1013
+
1014
  return {
1015
  "verification": VerificationResult(
1016
  answers_question=False,
 
1019
  ),
1020
  "retry_count": state.get("max_retries", self.max_retries) # FIXED: Skip retry
1021
  }
1022
+
1023
+ # For other cases, use LLM verification with raw data details
1024
+ system_prompt = _load_prompt_file("verification_prompt_system.txt")
1025
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1026
  verifier = self.llm.with_structured_output(VerificationResult)
 
 
 
 
 
1027
 
1028
+ # Build detailed verification prompt with raw data
1029
+ verification_prompt_template = _load_prompt_file("verification_prompt_user.txt")
1030
+ filters_info = chr(10).join(filters_applied_list) if filters_applied_list else "No explicit filters applied"
1031
+ verification_prompt = verification_prompt_template.format(
1032
+ question=question,
1033
+ retrieval_summary=chr(10).join(retrieval_summary),
1034
+ raw_data_details=chr(10).join(raw_data_details) if raw_data_details else 'No detailed data available',
1035
+ filters_applied=filters_info
1036
+ )
1037
 
1038
+ verification = verifier.invoke([
1039
+ SystemMessage(content=system_prompt),
1040
+ HumanMessage(content=verification_prompt)
1041
  ])
1042
+
1043
  if self.verbose:
1044
  print(f"Answers question: {verification.answers_question}")
1045
  if not verification.answers_question:
1046
  print(f"Missing: {verification.missing_info}")
1047
  print(f"Suggestion: {verification.improvement_suggestion}")
1048
+
1049
  # Increment retry count if verification fails
1050
  updates = {"verification": verification}
1051
  if not verification.answers_question:
1052
  current_retry = state.get("retry_count", 0)
1053
  updates["retry_count"] = current_retry + 1
1054
+
1055
  return updates
1056
+
1057
  def _route_after_verification(self, state: SurveyAnalysisState) -> str:
1058
  """Route based on verification result"""
1059
+
1060
  verification = state["verification"]
1061
  retry_count = state.get("retry_count", 0)
1062
  max_retries = state.get("max_retries", self.max_retries)
1063
+
1064
  if verification.answers_question:
1065
  return "synthesize"
1066
  elif retry_count < max_retries:
 
1071
  if self.verbose:
1072
  print(f"\n⚠️ Max retries reached, proceeding with partial results")
1073
  return "give_up"
1074
+
1075
  def _synthesize_response(self, state: SurveyAnalysisState) -> Dict[str, Any]:
1076
  """Synthesize final response from all results"""
1077
+
1078
  if self.verbose:
1079
  print("\n=== SYNTHESIZING RESPONSE ===")
1080
+
1081
  brief = state["research_brief"]
1082
+
1083
  # Use the latest question
1084
  full_question = self._get_full_question_context(state)
1085
+
1086
  # Handle followup action
1087
  if brief.action == "followup":
1088
  if self.verbose:
 
1091
  "final_answer": brief.followup_question,
1092
  "messages": [AIMessage(content=brief.followup_question)]
1093
  }
1094
+
1095
  # Handle direct answer (no data retrieval)
1096
  if brief.action == "answer":
1097
  if self.verbose:
 
1100
  SystemMessage(content="Answer the user's question directly."),
1101
  HumanMessage(content=full_question)
1102
  ]).content
1103
+
1104
  return {
1105
  "final_answer": answer,
1106
  "messages": [AIMessage(content=answer)]
1107
  }
1108
+
1109
  # Get stage results
1110
  stage_results = state.get("stage_results", [])
1111
+
1112
  if not stage_results:
1113
  if self.verbose:
1114
  print("No stage results available")
 
1116
  "final_answer": "I was unable to retrieve any data to answer your question.",
1117
  "messages": [AIMessage(content="I was unable to retrieve any data to answer your question.")]
1118
  }
1119
+
1120
+ # Always synthesize from raw data (removed direct answer path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1121
  if self.verbose:
1122
+ print(f"Synthesizing from {len(stage_results)} stage(s) using raw data")
1123
+
1124
+ # Build context from raw data structures
1125
  context_parts = []
1126
+
1127
  # TODO: REMOVE WHEN PIPELINES READY - START
1128
  unavailable_pipelines_overall = []
1129
  # TODO: REMOVE WHEN PIPELINES READY - END
1130
+
1131
  for i, stage_result in enumerate(stage_results, 1):
1132
+ # Format questionnaire raw data
1133
  if stage_result.questionnaire_results:
1134
  q_res = stage_result.questionnaire_results
1135
+ # Handle multiple results
1136
+ if isinstance(q_res, dict) and q_res.get("multiple"):
1137
+ all_q_results = q_res.get("results", [])
1138
+ else:
1139
+ all_q_results = [q_res]
1140
+
1141
+ for q_result in all_q_results:
1142
+ label = q_result.get("label", f"Stage {i}")
1143
+ source_questions = q_result.get("source_questions", [])
1144
+
1145
+ context_parts.append(f"\n=== {label.upper()} (QUESTIONNAIRE DATA) ===")
1146
+
1147
+ if not source_questions:
1148
+ context_parts.append("No questionnaire questions retrieved.")
1149
+ else:
1150
+ context_parts.append(f"Retrieved {len(source_questions)} question(s):\n")
1151
+
1152
+ # Format each question
1153
+ for j, q in enumerate(source_questions, 1):
1154
+ q_parts = [
1155
+ f"Question {j}: {q.get('question_text', 'N/A')}",
1156
+ f"Variable: {q.get('variable_name', 'N/A')}",
1157
+ f"Poll: {q.get('poll_date', 'N/A')}",
1158
+ f"Response Options: {' | '.join(q.get('response_options', []))}",
1159
+ ]
1160
+
1161
+ topics = q.get("topics", [])
1162
+ if isinstance(topics, str):
1163
+ topics = [t.strip() for t in topics.split(",")]
1164
+ if topics:
1165
+ q_parts.append(f"Topics: {', '.join(topics)}")
1166
+
1167
+ context_parts.append("\n".join(q_parts))
1168
+ context_parts.append("") # Blank line between questions
1169
+
1170
+ # Format toplines raw data
1171
+ if stage_result.toplines_results:
1172
+ t_res = stage_result.toplines_results
1173
+ # Handle multiple results
1174
+ if isinstance(t_res, dict) and t_res.get("multiple"):
1175
+ all_t_results = t_res.get("results", [])
1176
+ else:
1177
+ all_t_results = [t_res]
1178
 
1179
+ for t_result in all_t_results:
1180
+ label = t_result.get("label", f"Stage {i}")
1181
+ retrieved_docs = t_result.get("retrieved_docs", [])
1182
+
1183
+ context_parts.append(f"\n=== {label.upper()} (TOPLINES DATA) ===")
1184
+
1185
+ if not retrieved_docs:
1186
+ context_parts.append("No topline documents retrieved.")
1187
+ else:
1188
+ context_parts.append(f"Retrieved {len(retrieved_docs)} topline document(s):\n")
1189
+
1190
+ # Format each topline document - include full content
1191
+ for j, doc in enumerate(retrieved_docs, 1):
1192
+ metadata = doc.metadata or {}
1193
+ content = doc.page_content or ""
1194
+
1195
+ doc_parts = [
1196
+ f"--- Topline Document {j} ---",
1197
+ f"Survey: {metadata.get('survey_name', 'Vanderbilt Unity Poll')} ({metadata.get('month', '')} {metadata.get('year', '')})",
1198
+ f"Poll Date: {metadata.get('poll_date', 'N/A')}",
1199
+ f"Variable: {metadata.get('variable_name', 'N/A')}",
1200
+ f"Response: {metadata.get('response_label', 'N/A')}",
1201
+ f"Percentage: {metadata.get('pct', 'N/A')}%",
1202
+ ]
1203
+
1204
+ if content:
1205
+ doc_parts.append(f"\nFull Content:")
1206
+ doc_parts.append(content) # FULL content, no truncation
1207
+
1208
+ context_parts.append("\n".join(doc_parts))
1209
+ context_parts.append("") # Blank line between documents
1210
+
1211
+ # Format crosstabs raw data
1212
+ if stage_result.crosstabs_results:
1213
+ c_res = stage_result.crosstabs_results
1214
+ # Handle multiple results
1215
+ if isinstance(c_res, dict) and c_res.get("multiple"):
1216
+ all_c_results = c_res.get("results", [])
1217
+ else:
1218
+ all_c_results = [c_res]
1219
 
1220
+ for c_result in all_c_results:
1221
+ if "error" in c_result:
1222
+ context_parts.append(f"\n=== Stage {i} (CROSSTABS DATA) ===")
1223
+ context_parts.append(f"Error: {c_result['error']}")
1224
+ continue
1225
+
1226
+ label = c_result.get("label", f"Stage {i}")
1227
+ crosstab_docs_by_var = c_result.get("crosstab_docs_by_variable", {})
1228
+ matched_vars = c_result.get("matched_variables", [])
1229
+ namespace = c_result.get("namespace_used", "unknown")
1230
+ survey_info = c_result.get("survey_info", {})
1231
+
1232
+ context_parts.append(f"\n=== {label.upper()} (CROSSTABS DATA) ===")
1233
+ context_parts.append(
1234
+ f"Survey: {survey_info.get('poll', 'Unknown')} "
1235
+ f"({survey_info.get('month', '')} {survey_info.get('year', '')})"
1236
+ )
1237
+ context_parts.append(f"Namespace: {namespace}")
1238
+ context_parts.append(f"Matched {len(matched_vars)} variable(s): {', '.join(matched_vars)}\n")
1239
+
1240
+ if not crosstab_docs_by_var:
1241
+ context_parts.append("No crosstab documents retrieved.")
1242
+ else:
1243
+ # Format crosstab data for each variable - include ALL content for full analysis
1244
+ for var_name, var_data in crosstab_docs_by_var.items():
1245
+ crosstab_docs = var_data.get("crosstab_docs", [])
1246
+ question_text = var_data.get("question_text", "")
1247
+
1248
+ context_parts.append(f"\n{'='*80}")
1249
+ context_parts.append(f"Variable: {var_name}")
1250
+ context_parts.append(f"Question: {question_text}")
1251
+ context_parts.append(f"{'='*80}\n")
1252
+
1253
+ # Include ALL chunks with FULL content - no truncation
1254
+ # Sort by chunk_index to maintain order
1255
+ sorted_docs = sorted(crosstab_docs, key=lambda d: d.metadata.get("chunk_index", 999))
1256
+
1257
+ for doc in sorted_docs:
1258
+ content = doc.page_content or ""
1259
+ chunk_idx = doc.metadata.get("chunk_index", "?")
1260
+ context_parts.append(f"--- Crosstab Data Chunk {chunk_idx} ---")
1261
+ context_parts.append(content) # FULL content, no truncation
1262
+ context_parts.append("") # Blank line between chunks
1263
+
1264
+ context_parts.append("") # Extra blank line between variables
1265
+
1266
  # TODO: REMOVE WHEN PIPELINES READY - START
1267
  # Track unavailable pipelines for note in synthesis
1268
  if stage_result.extracted_context and "unavailable_pipelines" in stage_result.extracted_context:
 
1270
  unavailable_pipelines_overall.extend(unavailable)
1271
  context_parts.append(f"\n⚠️ Note: {', '.join(unavailable)} data was requested but not yet available")
1272
  # TODO: REMOVE WHEN PIPELINES READY - END
1273
+
1274
  # TODO: REMOVE WHEN PIPELINES READY - START
1275
  unavailable_note = ""
1276
  if unavailable_pipelines_overall:
 
1283
  Please answer based on the questionnaire data that IS available, and note any limitations.
1284
  """
1285
  # TODO: REMOVE WHEN PIPELINES READY - END
 
 
1286
 
1287
+ synthesis_prompt_template = _load_prompt_file("synthesis_prompt_user.txt")
1288
+ synthesis_prompt = synthesis_prompt_template.format(
1289
+ stage_count='multiple stages' if len(stage_results) > 1 else 'the research',
1290
+ full_question=full_question,
1291
+ reasoning=brief.reasoning,
1292
+ context_parts=chr(10).join(context_parts),
1293
+ unavailable_note=unavailable_note
1294
+ )
1295
 
1296
+ synthesis_system_prompt = _load_prompt_file("synthesis_prompt_system.txt")
 
 
 
 
 
 
 
 
 
1297
  final_answer = self.llm.invoke([
1298
+ SystemMessage(content=synthesis_system_prompt),
1299
  HumanMessage(content=synthesis_prompt)
1300
  ]).content
1301
+
1302
  if self.verbose:
1303
  print("Synthesis complete")
1304
+
1305
  return {
1306
  "final_answer": final_answer,
1307
  "messages": [AIMessage(content=final_answer)]
1308
  }
1309
+
1310
  # ========================================================================
1311
  # PUBLIC API
1312
  # ========================================================================
1313
+
1314
  def query(self, question: str, thread_id: str = "default") -> str:
1315
  """
1316
  Query the survey analysis system.
1317
+
1318
  Args:
1319
  question: User's question
1320
  thread_id: Conversation thread ID for memory
1321
+
1322
  Returns:
1323
  Answer string
1324
  """
1325
+
1326
  # Create initial state for this turn
1327
  initial_state = {
1328
  "messages": [HumanMessage(content=question)],
 
1339
  "retry_count": 0,
1340
  "max_retries": self.max_retries
1341
  }
1342
+
1343
  config = {
1344
  "configurable": {"thread_id": thread_id},
1345
  "recursion_limit": 50 # FIXED: Increased from default 25
1346
  }
1347
+
1348
  if self.verbose:
1349
  print(f"\n🧵 Thread ID: {thread_id}")
1350
+
1351
  final_state = self.graph.invoke(initial_state, config)
1352
+
1353
  return final_state["final_answer"]
1354
+
1355
  def stream_query(self, question: str, thread_id: str = "default"):
1356
  """Stream the query execution for real-time updates"""
1357
+
1358
  initial_state = {
1359
  "messages": [HumanMessage(content=question)],
1360
  "user_question": question,
 
1370
  "retry_count": 0,
1371
  "max_retries": self.max_retries
1372
  }
1373
+
1374
  config = {
1375
  "configurable": {"thread_id": thread_id},
1376
  "recursion_limit": 50 # FIXED: Increased from default 25
1377
  }
1378
+
1379
  for event in self.graph.stream(initial_state, config):
1380
  yield event
1381
 
 
1387
  def main():
1388
  """Interactive CLI"""
1389
  import sys
1390
+
1391
  openai_api_key = os.getenv("OPENAI_API_KEY")
1392
  pinecone_api_key = os.getenv("PINECONE_API_KEY")
1393
+
1394
  if not openai_api_key or not pinecone_api_key:
1395
  print("Error: Missing API keys")
1396
  print("Set OPENAI_API_KEY and PINECONE_API_KEY environment variables")
1397
  sys.exit(1)
1398
+
1399
  print("Initializing survey analysis agent...")
1400
  agent = SurveyAnalysisAgent(
1401
  openai_api_key=openai_api_key,
1402
  pinecone_api_key=pinecone_api_key,
1403
  verbose=True
1404
  )
1405
+
1406
  print("\n" + "="*80)
1407
  print("SURVEY ANALYSIS AGENT (WITH STAGED RESEARCH)")
1408
  print("="*80)
1409
  print("\nType 'quit' to exit\n")
1410
+
1411
  thread_id = "cli_session"
1412
+
1413
  while True:
1414
  try:
1415
  question = input("\nYour question: ").strip()
1416
+
1417
  if not question or question.lower() in ['quit', 'exit', 'q']:
1418
  print("\nGoodbye!")
1419
  break
1420
+
1421
  print("\n" + "-"*80)
1422
  answer = agent.query(question, thread_id=thread_id)
1423
  print("\n" + "="*80)
 
1425
  print("="*80)
1426
  print(answer)
1427
  print("="*80)
1428
+
1429
  except KeyboardInterrupt:
1430
  print("\n\nGoodbye!")
1431
  break
 
1436
 
1437
 
1438
  if __name__ == "__main__":
1439
+ main()
toplines_rag.py ADDED
@@ -0,0 +1,221 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ToplinesRAG
3
+ -----------
4
+ Queries the prebuilt Pinecone toplines vectorstore and synthesizes
5
+ a natural-language answer with citations using OpenAI.
6
+ """
7
+
8
+ import os
9
+ import re
10
+
11
+ from pathlib import Path
12
+ from typing import Any, Dict, List, Optional
13
+ from dotenv import load_dotenv
14
+ from langchain_openai import OpenAIEmbeddings, ChatOpenAI
15
+ from langchain_pinecone import PineconeVectorStore
16
+ from pinecone import Pinecone
17
+ from calendar import month_name
18
+
19
+ load_dotenv()
20
+
21
+
22
def _load_prompt_file(filename: str) -> str:
    """Return the text of a prompt template stored next to this module.

    Args:
        filename: Name of a file inside the ``prompts`` directory that sits
            beside this source file.

    Returns:
        The file contents decoded as UTF-8.

    Raises:
        FileNotFoundError: If no such file exists.
    """
    target = Path(__file__).parent.joinpath("prompts", filename)
    if target.exists():
        return target.read_text(encoding="utf-8")
    raise FileNotFoundError(f"Prompt file not found: {target}")
29
+
30
+
31
+ class ToplinesRAG:
32
def __init__(
    self,
    persist_directory: str = "./toplines_vectorstores",
    index_name: Optional[str] = None,
    llm_model: str = "gpt-4-turbo",
):
    """Connect to the toplines Pinecone index and set up the OpenAI clients.

    Args:
        persist_directory: Stored on the instance as a ``Path``; this
            constructor does not otherwise use it.
        index_name: Pinecone index name. When falsy, the
            ``PINECONE_INDEX_NAME_TOPLINES`` environment variable is used,
            defaulting to ``"toplines-index"``.
        llm_model: Chat model name passed to ``ChatOpenAI``.

    Raises:
        ValueError: If ``OPENAI_API_KEY`` or ``PINECONE_API_KEY_TOPLINES``
            is missing or empty in the environment.
    """
    self.persist_directory = Path(persist_directory)
    if index_name:
        self.index_name = index_name
    else:
        self.index_name = os.getenv("PINECONE_INDEX_NAME_TOPLINES", "toplines-index")

    # An unset or empty namespace variable is normalised to None.
    namespace_env = os.getenv("PINECONE_NAMESPACE")
    self.namespace = namespace_env if namespace_env else None

    openai_key = os.getenv("OPENAI_API_KEY")
    self.openai_api_key = openai_key
    if not openai_key:
        raise ValueError("OPENAI_API_KEY not set")

    toplines_key = os.getenv("PINECONE_API_KEY_TOPLINES")
    if not toplines_key:
        raise ValueError("PINECONE_API_KEY_TOPLINES not set")

    embed_model = os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small")
    self.embeddings = OpenAIEmbeddings(model=embed_model)

    # Pinecone client -> index handle -> LangChain vector store wrapper.
    self.pc = Pinecone(api_key=toplines_key)
    self.index = self.pc.Index(self.index_name)
    self.vector_store = PineconeVectorStore(
        index=self.index,
        embedding=self.embeddings,
        namespace=self.namespace,
    )

    self.llm_model = llm_model
    self.llm = ChatOpenAI(
        model=llm_model,
        openai_api_key=openai_key,
        temperature=0,
    )
65
+
66
+ # ----------------------------------------------------------
67
def _build_filter(self, filters: Dict[str, Any]) -> Optional[Dict]:
    """Translate a plain filters dict into a Pinecone metadata filter.

    Only ``year``, ``month``, ``poll_date`` and ``survey_name`` are kept;
    any other key (for example ``topic`` or ``question_ids``) and any
    ``None`` value is ignored. Each kept value is compared as a string
    with ``$eq``.

    Args:
        filters: Candidate filter fields mapped to their values.

    Returns:
        ``None`` when nothing usable remains, the single ``$eq`` clause
        when exactly one field remains, otherwise ``{"$and": [...]}`` with
        one clause per field in the input's iteration order.
    """
    if not filters:
        return None

    supported_fields = ("year", "month", "poll_date", "survey_name")

    conditions = []
    for field, value in filters.items():
        if field not in supported_fields or value is None:
            continue
        conditions.append({field: {"$eq": str(value)}})

    if not conditions:
        return None
    if len(conditions) == 1:
        return conditions[0]
    return {"$and": conditions}
88
+
89
+ # ----------------------------------------------------------
90
def _extract_filters_from_query(self, query: str) -> Dict[str, str]:
    """Pull a year and a month name out of free-text query wording.

    Args:
        query: The user's natural-language query.

    Returns:
        A dict with up to two keys: ``"year"`` (the first standalone
        four-digit ``20xx`` number) and ``"month"`` (the canonical
        ``calendar.month_name`` spelling of the earliest calendar month
        mentioned). Keys are omitted when nothing is found; an empty or
        ``None`` query yields an empty dict.
    """
    filters: Dict[str, str] = {}
    if not query:
        return filters

    # Digit lookarounds keep "120245" from being read as the year 2024
    # while still accepting unspaced input such as "June2025".
    year_match = re.search(r"(?<!\d)20\d{2}(?!\d)", query)
    if year_match:
        filters["year"] = year_match.group()

    lowered = query.lower()  # hoisted: invariant across the month loop
    for i in range(1, 13):
        name = month_name[i]
        # Letter lookarounds stop substring hits such as "may" inside
        # "maybe"/"mayor" or "march" inside "marching".
        # NOTE: the standalone modal verb "may" is still read as the month.
        pattern = rf"(?<![a-z]){re.escape(name.lower())}(?![a-z])"
        if re.search(pattern, lowered):
            filters["month"] = name
            break
    return filters
100
+
101
+ # ----------------------------------------------------------
102
def _synthesize_answer(self, query: str, docs: List[Dict]) -> str:
    """Turn retrieved topline documents into an answer with a sources list.

    Args:
        query: The user's question, inserted into the prompt template.
        docs: Retrieved documents; each is read through its ``.metadata``
            mapping (survey_name, month, year, variable_name,
            response_label, pct, poll_date).

    Returns:
        A fixed "no matching data" message when ``docs`` is empty.
        Otherwise the LLM's stripped reply under an ``--- ANSWER ---``
        header, followed by a ``--- SOURCES ---`` section with one line
        per document.
    """
    if not docs:
        # Nothing retrieved: answer without loading the prompt or calling the LLM.
        return (
            "Your query does not match any Vanderbilt Unity Poll data. "
            "This system only provides information from those polls."
        )

    # One five-line snippet per document, separated by blank lines.
    snippets = []
    for doc in docs:
        meta = doc.metadata
        survey = meta.get('survey_name', 'Vanderbilt Unity Poll')
        snippet_lines = [
            f"Survey: {survey} ({meta.get('month', '')} {meta.get('year', '')})",
            f"Question: {meta.get('variable_name', '')}",
            f"Response: {meta.get('response_label', '')}",
            f"Pct: {meta.get('pct', 'N/A')}",
            f"Poll Date: {meta.get('poll_date', 'N/A')}",
        ]
        snippets.append("\n".join(snippet_lines))
    context_snippets = "\n\n".join(snippets)

    # Fill the on-disk template and ask the model.
    template = _load_prompt_file("toplines_rag_prompt.txt")
    prompt = template.format(query=query, context_snippets=context_snippets)
    reply = self.llm.invoke(prompt)
    answer_text = reply.content.strip()

    # Citation line for every retrieved document.
    source_lines = []
    for doc in docs:
        meta = doc.metadata
        survey = meta.get('survey_name', 'Vanderbilt Unity Poll')
        poll_date = meta.get('poll_date', 'N/A')
        variable = meta.get('variable_name', 'N/A')
        source_lines.append(f"- {survey} ({poll_date}) | Variable: {variable}")

    header = f"\n--- ANSWER ---\n\n{answer_text}\n\n--- SOURCES ---\n"
    return header + "\n".join(source_lines)
140
+
141
+ # ----------------------------------------------------------
142
def query_toplines(self, query: str, filters: Optional[Dict[str, Any]] = None, top_k: int = 5) -> str:
    """
    Answer a poll question end to end: retrieve, then synthesize.

    Retrieval (filtered search plus the unfiltered fallback) is delegated
    to ``retrieve_raw_data`` so that this method and the raw-data entry
    point always apply the same rules.

    Args:
        query: The user's question.
        filters: Optional metadata filters (e.g. year, month).
        top_k: Number of records requested from the vector store.

    Returns:
        The formatted answer text produced by ``_synthesize_answer``.
    """
    retrieval = self.retrieve_raw_data(query, filters=filters, top_k=top_k)
    return self._synthesize_answer(query, retrieval["retrieved_docs"])
165
+
166
+ # ----------------------------------------------------------
167
def retrieve_raw_data(self, query: str, filters: Optional[Dict[str, Any]] = None, top_k: int = 5) -> Dict[str, Any]:
    """
    Retrieve raw data without LLM synthesis.
    Used by agent framework to get raw data for synthesis.

    Strategy:
      1. Similarity search restricted by the filter from ``_build_filter``.
      2. If that returns nothing and a filter was in effect, search again
         without a filter (``2 * top_k`` candidates) and keep the
         candidates whose metadata equals the requested values after
         string conversion.
      3. If none of the candidates match, fall back to the unfiltered
         candidates. In that case the returned docs may not satisfy the
         requested filters.

    At most ``top_k`` documents are returned on every path.

    Args:
        query: The user's question.
        filters: Optional metadata filters; only year, month, poll_date
            and survey_name are used for matching.
        top_k: Maximum number of documents to return.

    Returns:
        Dict with 'retrieved_docs', 'num_sources', 'filters_applied'
        ('filters_applied' echoes the filters that were requested).
    """
    requested = filters or {}
    pinecone_filter = self._build_filter(requested)

    docs = self.vector_store.similarity_search(query, k=top_k, filter=pinecone_filter)

    if not docs and pinecone_filter:
        # Retry without a filter to see whether the query matches anything.
        candidates = self.vector_store.similarity_search(query, k=top_k * 2)
        if candidates:
            wanted = {
                k: str(v) for k, v in requested.items()
                if k in {"year", "month", "poll_date", "survey_name"} and v
            }
            # Compare as strings so differently typed metadata values
            # (e.g. 2025 vs "2025") still count as a match.
            matching = [
                d for d in candidates
                if all(str(d.metadata.get(k, "")) == v for k, v in wanted.items())
            ]
            # Prefer filter-matching candidates, otherwise the broader
            # results; cap both at top_k like the primary search.
            docs = (matching or candidates)[:top_k]

    return {
        "retrieved_docs": docs,
        "num_sources": len(docs),
        "filters_applied": requested,
    }
201
+
202
+ # ----------------------------------------------------------
203
def interactive_loop(self):
    """
    Run a blocking question/answer session on the console.

    Each non-empty line is treated as a poll question: year/month filters
    are extracted from it, the answer is retrieved with
    ``query_toplines`` and printed. The loop ends when the user types
    'quit' or 'exit' (case-insensitive), or when input ends (EOF / Ctrl-D)
    or is interrupted (Ctrl-C). Blank lines are ignored instead of being
    sent through the retrieval pipeline.
    """
    print("ToplinesRAG ready! Type 'quit' or 'exit' to stop.\n")
    while True:
        try:
            query = input("Enter your poll question: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Piped input exhausted or user pressed Ctrl-D / Ctrl-C:
            # leave cleanly rather than with a traceback.
            print("\nExiting ToplinesRAG. Goodbye!")
            break

        if not query:
            # Nothing to search for; prompt again.
            continue

        if query.lower() in ("quit", "exit"):
            print("Exiting ToplinesRAG. Goodbye!")
            break

        filters = self._extract_filters_from_query(query)
        if filters:
            print(f"Using filters: {filters}")
        print("\nRetrieving answer...\n")
        answer = self.query_toplines(query, filters=filters)
        print(answer)
        print("\n" + "-"*60 + "\n")
217
+
218
+
219
if __name__ == "__main__":
    # Executed as a script: build the RAG client and start the console session.
    rag = ToplinesRAG()
    rag.interactive_loop()
toplines_vectorstores/poll_catalog_toplines.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "2025-February": {
3
+ "file": "toplines_data/Vanderbilt_Unity_Poll_2025_February_toplines.json",
4
+ "poll_date": "2025-February",
5
+ "num_toplines": 41,
6
+ "survey_name": "Vanderbilt Unity Poll",
7
+ "year": "2025",
8
+ "month": "February"
9
+ },
10
+ "2025-June": {
11
+ "file": "toplines_data/Vanderbilt_Unity_Poll_2025_June_toplines.json",
12
+ "poll_date": "2025-June",
13
+ "num_toplines": 167,
14
+ "survey_name": "Vanderbilt Unity Poll",
15
+ "year": "2025",
16
+ "month": "June"
17
+ }
18
+ }
toplines_vectorstores/toplines_index.json ADDED
@@ -0,0 +1,2290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "2025_February_0": {
3
+ "topline_id": "2025_February_0",
4
+ "poll_date": "2025-February",
5
+ "year": "2025",
6
+ "month": "February",
7
+ "survey_name": "Vanderbilt Unity Poll",
8
+ "variable_name": "Many Americans believe our political system needs reform. One area of possible reform involves the powers of the American presidency. Which of the following comes closest to your opinion?",
9
+ "response_label": "The powers of the presidency should be increased",
10
+ "pct": "7",
11
+ "count": ""
12
+ },
13
+ "2025_February_1": {
14
+ "topline_id": "2025_February_1",
15
+ "poll_date": "2025-February",
16
+ "year": "2025",
17
+ "month": "February",
18
+ "survey_name": "Vanderbilt Unity Poll",
19
+ "variable_name": "Many Americans believe our political system needs reform. One area of possible reform involves the powers of the American presidency. Which of the following comes closest to your opinion?",
20
+ "response_label": "The powers of the presidency should be decreased",
21
+ "pct": "37",
22
+ "count": ""
23
+ },
24
+ "2025_February_2": {
25
+ "topline_id": "2025_February_2",
26
+ "poll_date": "2025-February",
27
+ "year": "2025",
28
+ "month": "February",
29
+ "survey_name": "Vanderbilt Unity Poll",
30
+ "variable_name": "Many Americans believe our political system needs reform. One area of possible reform involves the powers of the American presidency. Which of the following comes closest to your opinion?",
31
+ "response_label": "The system needs reform, but there is no need to change the powers of the presidency",
32
+ "pct": "51",
33
+ "count": ""
34
+ },
35
+ "2025_February_3": {
36
+ "topline_id": "2025_February_3",
37
+ "poll_date": "2025-February",
38
+ "year": "2025",
39
+ "month": "February",
40
+ "survey_name": "Vanderbilt Unity Poll",
41
+ "variable_name": "Many Americans believe our political system needs reform. One area of possible reform involves the powers of the American presidency. Which of the following comes closest to your opinion?",
42
+ "response_label": "The system does not need reform",
43
+ "pct": "5",
44
+ "count": ""
45
+ },
46
+ "2025_February_4": {
47
+ "topline_id": "2025_February_4",
48
+ "poll_date": "2025-February",
49
+ "year": "2025",
50
+ "month": "February",
51
+ "survey_name": "Vanderbilt Unity Poll",
52
+ "variable_name": "Many Americans believe our political system needs reform. One area of possible reform involves the powers of the American presidency. Which of the following comes closest to your opinion?",
53
+ "response_label": "Don't know",
54
+ "pct": "0",
55
+ "count": ""
56
+ },
57
+ "2025_February_5": {
58
+ "topline_id": "2025_February_5",
59
+ "poll_date": "2025-February",
60
+ "year": "2025",
61
+ "month": "February",
62
+ "survey_name": "Vanderbilt Unity Poll",
63
+ "variable_name": "Many Americans believe our political system needs reform. One area of possible reform involves the powers of the American presidency. Which of the following comes closest to your opinion?",
64
+ "response_label": "Refused",
65
+ "pct": "0",
66
+ "count": ""
67
+ },
68
+ "2025_February_6": {
69
+ "topline_id": "2025_February_6",
70
+ "poll_date": "2025-February",
71
+ "year": "2025",
72
+ "month": "February",
73
+ "survey_name": "Vanderbilt Unity Poll",
74
+ "variable_name": "When it comes to the most pressing issues facing the country today, in general, would you say ordinary Americans are:",
75
+ "response_label": "Mostly united",
76
+ "pct": "4",
77
+ "count": ""
78
+ },
79
+ "2025_February_7": {
80
+ "topline_id": "2025_February_7",
81
+ "poll_date": "2025-February",
82
+ "year": "2025",
83
+ "month": "February",
84
+ "survey_name": "Vanderbilt Unity Poll",
85
+ "variable_name": "When it comes to the most pressing issues facing the country today, in general, would you say ordinary Americans are:",
86
+ "response_label": "Somewhat united",
87
+ "pct": "16",
88
+ "count": ""
89
+ },
90
+ "2025_February_8": {
91
+ "topline_id": "2025_February_8",
92
+ "poll_date": "2025-February",
93
+ "year": "2025",
94
+ "month": "February",
95
+ "survey_name": "Vanderbilt Unity Poll",
96
+ "variable_name": "When it comes to the most pressing issues facing the country today, in general, would you say ordinary Americans are:",
97
+ "response_label": "Somewhat divided",
98
+ "pct": "35",
99
+ "count": ""
100
+ },
101
+ "2025_February_9": {
102
+ "topline_id": "2025_February_9",
103
+ "poll_date": "2025-February",
104
+ "year": "2025",
105
+ "month": "February",
106
+ "survey_name": "Vanderbilt Unity Poll",
107
+ "variable_name": "When it comes to the most pressing issues facing the country today, in general, would you say ordinary Americans are:",
108
+ "response_label": "Mostly divided",
109
+ "pct": "45",
110
+ "count": ""
111
+ },
112
+ "2025_February_10": {
113
+ "topline_id": "2025_February_10",
114
+ "poll_date": "2025-February",
115
+ "year": "2025",
116
+ "month": "February",
117
+ "survey_name": "Vanderbilt Unity Poll",
118
+ "variable_name": "When it comes to the most pressing issues facing the country today, in general, would you say ordinary Americans are:",
119
+ "response_label": "Don't know",
120
+ "pct": "0",
121
+ "count": ""
122
+ },
123
+ "2025_February_11": {
124
+ "topline_id": "2025_February_11",
125
+ "poll_date": "2025-February",
126
+ "year": "2025",
127
+ "month": "February",
128
+ "survey_name": "Vanderbilt Unity Poll",
129
+ "variable_name": "When it comes to the most pressing issues facing the country today, in general, would you say ordinary Americans are:",
130
+ "response_label": "Refused",
131
+ "pct": "0",
132
+ "count": ""
133
+ },
134
+ "2025_February_12": {
135
+ "topline_id": "2025_February_12",
136
+ "poll_date": "2025-February",
137
+ "year": "2025",
138
+ "month": "February",
139
+ "survey_name": "Vanderbilt Unity Poll",
140
+ "variable_name": "When trying to solve important problems facing the country today, how likely, if at all, do you think it is that Americans will unite?",
141
+ "response_label": "Very likely",
142
+ "pct": "6",
143
+ "count": ""
144
+ },
145
+ "2025_February_13": {
146
+ "topline_id": "2025_February_13",
147
+ "poll_date": "2025-February",
148
+ "year": "2025",
149
+ "month": "February",
150
+ "survey_name": "Vanderbilt Unity Poll",
151
+ "variable_name": "When trying to solve important problems facing the country today, how likely, if at all, do you think it is that Americans will unite?",
152
+ "response_label": "Somewhat likely",
153
+ "pct": "33",
154
+ "count": ""
155
+ },
156
+ "2025_February_14": {
157
+ "topline_id": "2025_February_14",
158
+ "poll_date": "2025-February",
159
+ "year": "2025",
160
+ "month": "February",
161
+ "survey_name": "Vanderbilt Unity Poll",
162
+ "variable_name": "When trying to solve important problems facing the country today, how likely, if at all, do you think it is that Americans will unite?",
163
+ "response_label": "Somewhat unlikely",
164
+ "pct": "39",
165
+ "count": ""
166
+ },
167
+ "2025_February_15": {
168
+ "topline_id": "2025_February_15",
169
+ "poll_date": "2025-February",
170
+ "year": "2025",
171
+ "month": "February",
172
+ "survey_name": "Vanderbilt Unity Poll",
173
+ "variable_name": "When trying to solve important problems facing the country today, how likely, if at all, do you think it is that Americans will unite?",
174
+ "response_label": "Very unlikely",
175
+ "pct": "22",
176
+ "count": ""
177
+ },
178
+ "2025_February_16": {
179
+ "topline_id": "2025_February_16",
180
+ "poll_date": "2025-February",
181
+ "year": "2025",
182
+ "month": "February",
183
+ "survey_name": "Vanderbilt Unity Poll",
184
+ "variable_name": "When trying to solve important problems facing the country today, how likely, if at all, do you think it is that Americans will unite?",
185
+ "response_label": "Don't know",
186
+ "pct": "0",
187
+ "count": ""
188
+ },
189
+ "2025_February_17": {
190
+ "topline_id": "2025_February_17",
191
+ "poll_date": "2025-February",
192
+ "year": "2025",
193
+ "month": "February",
194
+ "survey_name": "Vanderbilt Unity Poll",
195
+ "variable_name": "When trying to solve important problems facing the country today, how likely, if at all, do you think it is that Americans will unite?",
196
+ "response_label": "Refused",
197
+ "pct": "0",
198
+ "count": ""
199
+ },
200
+ "2025_February_18": {
201
+ "topline_id": "2025_February_18",
202
+ "poll_date": "2025-February",
203
+ "year": "2025",
204
+ "month": "February",
205
+ "survey_name": "Vanderbilt Unity Poll",
206
+ "variable_name": "How often do you talk about politics with someone who has an opposing political viewpoint?",
207
+ "response_label": "Every day",
208
+ "pct": "5",
209
+ "count": ""
210
+ },
211
+ "2025_February_19": {
212
+ "topline_id": "2025_February_19",
213
+ "poll_date": "2025-February",
214
+ "year": "2025",
215
+ "month": "February",
216
+ "survey_name": "Vanderbilt Unity Poll",
217
+ "variable_name": "How often do you talk about politics with someone who has an opposing political viewpoint?",
218
+ "response_label": "Every few days",
219
+ "pct": "9",
220
+ "count": ""
221
+ },
222
+ "2025_February_20": {
223
+ "topline_id": "2025_February_20",
224
+ "poll_date": "2025-February",
225
+ "year": "2025",
226
+ "month": "February",
227
+ "survey_name": "Vanderbilt Unity Poll",
228
+ "variable_name": "How often do you talk about politics with someone who has an opposing political viewpoint?",
229
+ "response_label": "Once a week",
230
+ "pct": "10",
231
+ "count": ""
232
+ },
233
+ "2025_February_21": {
234
+ "topline_id": "2025_February_21",
235
+ "poll_date": "2025-February",
236
+ "year": "2025",
237
+ "month": "February",
238
+ "survey_name": "Vanderbilt Unity Poll",
239
+ "variable_name": "How often do you talk about politics with someone who has an opposing political viewpoint?",
240
+ "response_label": "A few times a month",
241
+ "pct": "18",
242
+ "count": ""
243
+ },
244
+ "2025_February_22": {
245
+ "topline_id": "2025_February_22",
246
+ "poll_date": "2025-February",
247
+ "year": "2025",
248
+ "month": "February",
249
+ "survey_name": "Vanderbilt Unity Poll",
250
+ "variable_name": "How often do you talk about politics with someone who has an opposing political viewpoint?",
251
+ "response_label": "A few times a year",
252
+ "pct": "26",
253
+ "count": ""
254
+ },
255
+ "2025_February_23": {
256
+ "topline_id": "2025_February_23",
257
+ "poll_date": "2025-February",
258
+ "year": "2025",
259
+ "month": "February",
260
+ "survey_name": "Vanderbilt Unity Poll",
261
+ "variable_name": "How often do you talk about politics with someone who has an opposing political viewpoint?",
262
+ "response_label": "Never \u2013 I talk about politics, but not with anyone who has an opposing political viewpoint",
263
+ "pct": "13",
264
+ "count": ""
265
+ },
266
+ "2025_February_24": {
267
+ "topline_id": "2025_February_24",
268
+ "poll_date": "2025-February",
269
+ "year": "2025",
270
+ "month": "February",
271
+ "survey_name": "Vanderbilt Unity Poll",
272
+ "variable_name": "How often do you talk about politics with someone who has an opposing political viewpoint?",
273
+ "response_label": "Never - I don\u2019t talk about politics",
274
+ "pct": "19",
275
+ "count": ""
276
+ },
277
+ "2025_February_25": {
278
+ "topline_id": "2025_February_25",
279
+ "poll_date": "2025-February",
280
+ "year": "2025",
281
+ "month": "February",
282
+ "survey_name": "Vanderbilt Unity Poll",
283
+ "variable_name": "How often do you talk about politics with someone who has an opposing political viewpoint?",
284
+ "response_label": "Don't know",
285
+ "pct": "0",
286
+ "count": ""
287
+ },
288
+ "2025_February_26": {
289
+ "topline_id": "2025_February_26",
290
+ "poll_date": "2025-February",
291
+ "year": "2025",
292
+ "month": "February",
293
+ "survey_name": "Vanderbilt Unity Poll",
294
+ "variable_name": "How often do you talk about politics with someone who has an opposing political viewpoint?",
295
+ "response_label": "Refused",
296
+ "pct": "0",
297
+ "count": ""
298
+ },
299
+ "2025_February_27": {
300
+ "topline_id": "2025_February_27",
301
+ "poll_date": "2025-February",
302
+ "year": "2025",
303
+ "month": "February",
304
+ "survey_name": "Vanderbilt Unity Poll",
305
+ "variable_name": "Which of the following comes closest to your view, even if neither is exactly right?",
306
+ "response_label": "I would prefer my elected officials work with members of the other political party even if it means they have to compromise on some of their values and priorities.",
307
+ "pct": "76",
308
+ "count": ""
309
+ },
310
+ "2025_February_28": {
311
+ "topline_id": "2025_February_28",
312
+ "poll_date": "2025-February",
313
+ "year": "2025",
314
+ "month": "February",
315
+ "survey_name": "Vanderbilt Unity Poll",
316
+ "variable_name": "Which of the following comes closest to your view, even if neither is exactly right?",
317
+ "response_label": "I would prefer my elected officials pursue their own values and priorities even if it means they are unwilling to work with members of the other political party.",
318
+ "pct": "24",
319
+ "count": ""
320
+ },
321
+ "2025_February_29": {
322
+ "topline_id": "2025_February_29",
323
+ "poll_date": "2025-February",
324
+ "year": "2025",
325
+ "month": "February",
326
+ "survey_name": "Vanderbilt Unity Poll",
327
+ "variable_name": "Which of the following comes closest to your view, even if neither is exactly right?",
328
+ "response_label": "Don't know",
329
+ "pct": "0",
330
+ "count": ""
331
+ },
332
+ "2025_February_30": {
333
+ "topline_id": "2025_February_30",
334
+ "poll_date": "2025-February",
335
+ "year": "2025",
336
+ "month": "February",
337
+ "survey_name": "Vanderbilt Unity Poll",
338
+ "variable_name": "Which of the following comes closest to your view, even if neither is exactly right?",
339
+ "response_label": "Refused",
340
+ "pct": "0",
341
+ "count": ""
342
+ },
343
+ "2025_February_31": {
344
+ "topline_id": "2025_February_31",
345
+ "poll_date": "2025-February",
346
+ "year": "2025",
347
+ "month": "February",
348
+ "survey_name": "Vanderbilt Unity Poll",
349
+ "variable_name": "In general, how confident are you that the political system in the United States today reflects the public\u2019s views on the pressing issues of the day?",
350
+ "response_label": "Very confident",
351
+ "pct": "7",
352
+ "count": ""
353
+ },
354
+ "2025_February_32": {
355
+ "topline_id": "2025_February_32",
356
+ "poll_date": "2025-February",
357
+ "year": "2025",
358
+ "month": "February",
359
+ "survey_name": "Vanderbilt Unity Poll",
360
+ "variable_name": "In general, how confident are you that the political system in the United States today reflects the public\u2019s views on the pressing issues of the day?",
361
+ "response_label": "Somewhat confident",
362
+ "pct": "24",
363
+ "count": ""
364
+ },
365
+ "2025_February_33": {
366
+ "topline_id": "2025_February_33",
367
+ "poll_date": "2025-February",
368
+ "year": "2025",
369
+ "month": "February",
370
+ "survey_name": "Vanderbilt Unity Poll",
371
+ "variable_name": "In general, how confident are you that the political system in the United States today reflects the public\u2019s views on the pressing issues of the day?",
372
+ "response_label": "Just a little confident",
373
+ "pct": "31",
374
+ "count": ""
375
+ },
376
+ "2025_February_34": {
377
+ "topline_id": "2025_February_34",
378
+ "poll_date": "2025-February",
379
+ "year": "2025",
380
+ "month": "February",
381
+ "survey_name": "Vanderbilt Unity Poll",
382
+ "variable_name": "In general, how confident are you that the political system in the United States today reflects the public\u2019s views on the pressing issues of the day?",
383
+ "response_label": "Not at all confident",
384
+ "pct": "39",
385
+ "count": ""
386
+ },
387
+ "2025_February_35": {
388
+ "topline_id": "2025_February_35",
389
+ "poll_date": "2025-February",
390
+ "year": "2025",
391
+ "month": "February",
392
+ "survey_name": "Vanderbilt Unity Poll",
393
+ "variable_name": "In general, how confident are you that the political system in the United States today reflects the public\u2019s views on the pressing issues of the day?",
394
+ "response_label": "Don't know",
395
+ "pct": "0",
396
+ "count": ""
397
+ },
398
+ "2025_February_36": {
399
+ "topline_id": "2025_February_36",
400
+ "poll_date": "2025-February",
401
+ "year": "2025",
402
+ "month": "February",
403
+ "survey_name": "Vanderbilt Unity Poll",
404
+ "variable_name": "In general, how confident are you that the political system in the United States today reflects the public\u2019s views on the pressing issues of the day?",
405
+ "response_label": "Refused",
406
+ "pct": "0",
407
+ "count": ""
408
+ },
409
+ "2025_February_37": {
410
+ "topline_id": "2025_February_37",
411
+ "poll_date": "2025-February",
412
+ "year": "2025",
413
+ "month": "February",
414
+ "survey_name": "Vanderbilt Unity Poll",
415
+ "variable_name": "When it comes to political matters, do you consider yourself to be:",
416
+ "response_label": "More of a supporter of the Make America Great Again or MAGA movement",
417
+ "pct": "52",
418
+ "count": ""
419
+ },
420
+ "2025_February_38": {
421
+ "topline_id": "2025_February_38",
422
+ "poll_date": "2025-February",
423
+ "year": "2025",
424
+ "month": "February",
425
+ "survey_name": "Vanderbilt Unity Poll",
426
+ "variable_name": "When it comes to political matters, do you consider yourself to be:",
427
+ "response_label": "More of a supporter of the Republican Party",
428
+ "pct": "48",
429
+ "count": ""
430
+ },
431
+ "2025_February_39": {
432
+ "topline_id": "2025_February_39",
433
+ "poll_date": "2025-February",
434
+ "year": "2025",
435
+ "month": "February",
436
+ "survey_name": "Vanderbilt Unity Poll",
437
+ "variable_name": "When it comes to political matters, do you consider yourself to be:",
438
+ "response_label": "Don't know",
439
+ "pct": "0",
440
+ "count": ""
441
+ },
442
+ "2025_February_40": {
443
+ "topline_id": "2025_February_40",
444
+ "poll_date": "2025-February",
445
+ "year": "2025",
446
+ "month": "February",
447
+ "survey_name": "Vanderbilt Unity Poll",
448
+ "variable_name": "When it comes to political matters, do you consider yourself to be:",
449
+ "response_label": "Refused",
450
+ "pct": "0",
451
+ "count": ""
452
+ },
453
+ "2025_June_0": {
454
+ "topline_id": "2025_June_0",
455
+ "poll_date": "2025-June",
456
+ "year": "2025",
457
+ "month": "June",
458
+ "survey_name": "Vanderbilt Unity Poll",
459
+ "variable_name": "Which of the following topics are you most interested in?",
460
+ "response_label": "Politics",
461
+ "pct": "11",
462
+ "count": ""
463
+ },
464
+ "2025_June_1": {
465
+ "topline_id": "2025_June_1",
466
+ "poll_date": "2025-June",
467
+ "year": "2025",
468
+ "month": "June",
469
+ "survey_name": "Vanderbilt Unity Poll",
470
+ "variable_name": "Which of the following topics are you most interested in?",
471
+ "response_label": "Sports",
472
+ "pct": "13",
473
+ "count": ""
474
+ },
475
+ "2025_June_2": {
476
+ "topline_id": "2025_June_2",
477
+ "poll_date": "2025-June",
478
+ "year": "2025",
479
+ "month": "June",
480
+ "survey_name": "Vanderbilt Unity Poll",
481
+ "variable_name": "Which of the following topics are you most interested in?",
482
+ "response_label": "Culture and entertainment",
483
+ "pct": "10",
484
+ "count": ""
485
+ },
486
+ "2025_June_3": {
487
+ "topline_id": "2025_June_3",
488
+ "poll_date": "2025-June",
489
+ "year": "2025",
490
+ "month": "June",
491
+ "survey_name": "Vanderbilt Unity Poll",
492
+ "variable_name": "Which of the following topics are you most interested in?",
493
+ "response_label": "Personal finance and money",
494
+ "pct": "13",
495
+ "count": ""
496
+ },
497
+ "2025_June_4": {
498
+ "topline_id": "2025_June_4",
499
+ "poll_date": "2025-June",
500
+ "year": "2025",
501
+ "month": "June",
502
+ "survey_name": "Vanderbilt Unity Poll",
503
+ "variable_name": "Which of the following topics are you most interested in?",
504
+ "response_label": "Lifestyle issues",
505
+ "pct": "6",
506
+ "count": ""
507
+ },
508
+ "2025_June_5": {
509
+ "topline_id": "2025_June_5",
510
+ "poll_date": "2025-June",
511
+ "year": "2025",
512
+ "month": "June",
513
+ "survey_name": "Vanderbilt Unity Poll",
514
+ "variable_name": "Which of the following topics are you most interested in?",
515
+ "response_label": "Travel",
516
+ "pct": "11",
517
+ "count": ""
518
+ },
519
+ "2025_June_6": {
520
+ "topline_id": "2025_June_6",
521
+ "poll_date": "2025-June",
522
+ "year": "2025",
523
+ "month": "June",
524
+ "survey_name": "Vanderbilt Unity Poll",
525
+ "variable_name": "Which of the following topics are you most interested in?",
526
+ "response_label": "Health and wellness",
527
+ "pct": "25",
528
+ "count": ""
529
+ },
530
+ "2025_June_7": {
531
+ "topline_id": "2025_June_7",
532
+ "poll_date": "2025-June",
533
+ "year": "2025",
534
+ "month": "June",
535
+ "survey_name": "Vanderbilt Unity Poll",
536
+ "variable_name": "Which of the following topics are you most interested in?",
537
+ "response_label": "Technology and innovation",
538
+ "pct": "11",
539
+ "count": ""
540
+ },
541
+ "2025_June_8": {
542
+ "topline_id": "2025_June_8",
543
+ "poll_date": "2025-June",
544
+ "year": "2025",
545
+ "month": "June",
546
+ "survey_name": "Vanderbilt Unity Poll",
547
+ "variable_name": "Which of the following topics are you most interested in?",
548
+ "response_label": "Don't know",
549
+ "pct": "0",
550
+ "count": ""
551
+ },
552
+ "2025_June_9": {
553
+ "topline_id": "2025_June_9",
554
+ "poll_date": "2025-June",
555
+ "year": "2025",
556
+ "month": "June",
557
+ "survey_name": "Vanderbilt Unity Poll",
558
+ "variable_name": "Which of the following topics are you most interested in?",
559
+ "response_label": "Refused",
560
+ "pct": "0",
561
+ "count": ""
562
+ },
563
+ "2025_June_10": {
564
+ "topline_id": "2025_June_10",
565
+ "poll_date": "2025-June",
566
+ "year": "2025",
567
+ "month": "June",
568
+ "survey_name": "Vanderbilt Unity Poll",
569
+ "variable_name": "Compared to one year ago, is your personal financial situation today:",
570
+ "response_label": "Better",
571
+ "pct": "24",
572
+ "count": ""
573
+ },
574
+ "2025_June_11": {
575
+ "topline_id": "2025_June_11",
576
+ "poll_date": "2025-June",
577
+ "year": "2025",
578
+ "month": "June",
579
+ "survey_name": "Vanderbilt Unity Poll",
580
+ "variable_name": "Compared to one year ago, is your personal financial situation today:",
581
+ "response_label": "About the same",
582
+ "pct": "49",
583
+ "count": ""
584
+ },
585
+ "2025_June_12": {
586
+ "topline_id": "2025_June_12",
587
+ "poll_date": "2025-June",
588
+ "year": "2025",
589
+ "month": "June",
590
+ "survey_name": "Vanderbilt Unity Poll",
591
+ "variable_name": "Compared to one year ago, is your personal financial situation today:",
592
+ "response_label": "Worse",
593
+ "pct": "27",
594
+ "count": ""
595
+ },
596
+ "2025_June_13": {
597
+ "topline_id": "2025_June_13",
598
+ "poll_date": "2025-June",
599
+ "year": "2025",
600
+ "month": "June",
601
+ "survey_name": "Vanderbilt Unity Poll",
602
+ "variable_name": "Compared to one year ago, is your personal financial situation today:",
603
+ "response_label": "Don't know",
604
+ "pct": "0",
605
+ "count": ""
606
+ },
607
+ "2025_June_14": {
608
+ "topline_id": "2025_June_14",
609
+ "poll_date": "2025-June",
610
+ "year": "2025",
611
+ "month": "June",
612
+ "survey_name": "Vanderbilt Unity Poll",
613
+ "variable_name": "Compared to one year ago, is your personal financial situation today:",
614
+ "response_label": "Refused",
615
+ "pct": "0",
616
+ "count": ""
617
+ },
618
+ "2025_June_15": {
619
+ "topline_id": "2025_June_15",
620
+ "poll_date": "2025-June",
621
+ "year": "2025",
622
+ "month": "June",
623
+ "survey_name": "Vanderbilt Unity Poll",
624
+ "variable_name": "Compared to one year ago, is the level of economic uncertainty you face today:",
625
+ "response_label": "Better",
626
+ "pct": "15",
627
+ "count": ""
628
+ },
629
+ "2025_June_16": {
630
+ "topline_id": "2025_June_16",
631
+ "poll_date": "2025-June",
632
+ "year": "2025",
633
+ "month": "June",
634
+ "survey_name": "Vanderbilt Unity Poll",
635
+ "variable_name": "Compared to one year ago, is the level of economic uncertainty you face today:",
636
+ "response_label": "About the same",
637
+ "pct": "35",
638
+ "count": ""
639
+ },
640
+ "2025_June_17": {
641
+ "topline_id": "2025_June_17",
642
+ "poll_date": "2025-June",
643
+ "year": "2025",
644
+ "month": "June",
645
+ "survey_name": "Vanderbilt Unity Poll",
646
+ "variable_name": "Compared to one year ago, is the level of economic uncertainty you face today:",
647
+ "response_label": "Worse",
648
+ "pct": "50",
649
+ "count": ""
650
+ },
651
+ "2025_June_18": {
652
+ "topline_id": "2025_June_18",
653
+ "poll_date": "2025-June",
654
+ "year": "2025",
655
+ "month": "June",
656
+ "survey_name": "Vanderbilt Unity Poll",
657
+ "variable_name": "Compared to one year ago, is the level of economic uncertainty you face today:",
658
+ "response_label": "Don't know",
659
+ "pct": "0",
660
+ "count": ""
661
+ },
662
+ "2025_June_19": {
663
+ "topline_id": "2025_June_19",
664
+ "poll_date": "2025-June",
665
+ "year": "2025",
666
+ "month": "June",
667
+ "survey_name": "Vanderbilt Unity Poll",
668
+ "variable_name": "Compared to one year ago, is the level of economic uncertainty you face today:",
669
+ "response_label": "Refused",
670
+ "pct": "0",
671
+ "count": ""
672
+ },
673
+ "2025_June_20": {
674
+ "topline_id": "2025_June_20",
675
+ "poll_date": "2025-June",
676
+ "year": "2025",
677
+ "month": "June",
678
+ "survey_name": "Vanderbilt Unity Poll",
679
+ "variable_name": "Now thinking ahead, in the next year, do you think President Trump\u2019s tariff policies will make your personal finances:",
680
+ "response_label": "Better",
681
+ "pct": "14",
682
+ "count": ""
683
+ },
684
+ "2025_June_21": {
685
+ "topline_id": "2025_June_21",
686
+ "poll_date": "2025-June",
687
+ "year": "2025",
688
+ "month": "June",
689
+ "survey_name": "Vanderbilt Unity Poll",
690
+ "variable_name": "Now thinking ahead, in the next year, do you think President Trump\u2019s tariff policies will make your personal finances:",
691
+ "response_label": "About the same",
692
+ "pct": "28",
693
+ "count": ""
694
+ },
695
+ "2025_June_22": {
696
+ "topline_id": "2025_June_22",
697
+ "poll_date": "2025-June",
698
+ "year": "2025",
699
+ "month": "June",
700
+ "survey_name": "Vanderbilt Unity Poll",
701
+ "variable_name": "Now thinking ahead, in the next year, do you think President Trump\u2019s tariff policies will make your personal finances:",
702
+ "response_label": "Worse",
703
+ "pct": "58",
704
+ "count": ""
705
+ },
706
+ "2025_June_23": {
707
+ "topline_id": "2025_June_23",
708
+ "poll_date": "2025-June",
709
+ "year": "2025",
710
+ "month": "June",
711
+ "survey_name": "Vanderbilt Unity Poll",
712
+ "variable_name": "Now thinking ahead, in the next year, do you think President Trump\u2019s tariff policies will make your personal finances:",
713
+ "response_label": "Don't know",
714
+ "pct": "0",
715
+ "count": ""
716
+ },
717
+ "2025_June_24": {
718
+ "topline_id": "2025_June_24",
719
+ "poll_date": "2025-June",
720
+ "year": "2025",
721
+ "month": "June",
722
+ "survey_name": "Vanderbilt Unity Poll",
723
+ "variable_name": "Now thinking ahead, in the next year, do you think President Trump\u2019s tariff policies will make your personal finances:",
724
+ "response_label": "Refused",
725
+ "pct": "0",
726
+ "count": ""
727
+ },
728
+ "2025_June_25": {
729
+ "topline_id": "2025_June_25",
730
+ "poll_date": "2025-June",
731
+ "year": "2025",
732
+ "month": "June",
733
+ "survey_name": "Vanderbilt Unity Poll",
734
+ "variable_name": "How concerned, if at all, are you about the recent instability of the stock market?",
735
+ "response_label": "Concerned NET",
736
+ "pct": "33",
737
+ "count": ""
738
+ },
739
+ "2025_June_26": {
740
+ "topline_id": "2025_June_26",
741
+ "poll_date": "2025-June",
742
+ "year": "2025",
743
+ "month": "June",
744
+ "survey_name": "Vanderbilt Unity Poll",
745
+ "variable_name": "How concerned, if at all, are you about the recent instability of the stock market?",
746
+ "response_label": "Not concerned NET",
747
+ "pct": "33",
748
+ "count": ""
749
+ },
750
+ "2025_June_27": {
751
+ "topline_id": "2025_June_27",
752
+ "poll_date": "2025-June",
753
+ "year": "2025",
754
+ "month": "June",
755
+ "survey_name": "Vanderbilt Unity Poll",
756
+ "variable_name": "How concerned, if at all, are you about the recent instability of the stock market?",
757
+ "response_label": "Extremely concerned",
758
+ "pct": "15",
759
+ "count": ""
760
+ },
761
+ "2025_June_28": {
762
+ "topline_id": "2025_June_28",
763
+ "poll_date": "2025-June",
764
+ "year": "2025",
765
+ "month": "June",
766
+ "survey_name": "Vanderbilt Unity Poll",
767
+ "variable_name": "How concerned, if at all, are you about the recent instability of the stock market?",
768
+ "response_label": "Very concerned",
769
+ "pct": "18",
770
+ "count": ""
771
+ },
772
+ "2025_June_29": {
773
+ "topline_id": "2025_June_29",
774
+ "poll_date": "2025-June",
775
+ "year": "2025",
776
+ "month": "June",
777
+ "survey_name": "Vanderbilt Unity Poll",
778
+ "variable_name": "How concerned, if at all, are you about the recent instability of the stock market?",
779
+ "response_label": "Somewhat concerned",
780
+ "pct": "33",
781
+ "count": ""
782
+ },
783
+ "2025_June_30": {
784
+ "topline_id": "2025_June_30",
785
+ "poll_date": "2025-June",
786
+ "year": "2025",
787
+ "month": "June",
788
+ "survey_name": "Vanderbilt Unity Poll",
789
+ "variable_name": "How concerned, if at all, are you about the recent instability of the stock market?",
790
+ "response_label": "Not too concerned",
791
+ "pct": "22",
792
+ "count": ""
793
+ },
794
+ "2025_June_31": {
795
+ "topline_id": "2025_June_31",
796
+ "poll_date": "2025-June",
797
+ "year": "2025",
798
+ "month": "June",
799
+ "survey_name": "Vanderbilt Unity Poll",
800
+ "variable_name": "How concerned, if at all, are you about the recent instability of the stock market?",
801
+ "response_label": "Not at all concerned",
802
+ "pct": "11",
803
+ "count": ""
804
+ },
805
+ "2025_June_32": {
806
+ "topline_id": "2025_June_32",
807
+ "poll_date": "2025-June",
808
+ "year": "2025",
809
+ "month": "June",
810
+ "survey_name": "Vanderbilt Unity Poll",
811
+ "variable_name": "How concerned, if at all, are you about the recent instability of the stock market?",
812
+ "response_label": "Don't know",
813
+ "pct": "0",
814
+ "count": ""
815
+ },
816
+ "2025_June_33": {
817
+ "topline_id": "2025_June_33",
818
+ "poll_date": "2025-June",
819
+ "year": "2025",
820
+ "month": "June",
821
+ "survey_name": "Vanderbilt Unity Poll",
822
+ "variable_name": "How concerned, if at all, are you about the recent instability of the stock market?",
823
+ "response_label": "Refused",
824
+ "pct": "0",
825
+ "count": ""
826
+ },
827
+ "2025_June_34": {
828
+ "topline_id": "2025_June_34",
829
+ "poll_date": "2025-June",
830
+ "year": "2025",
831
+ "month": "June",
832
+ "survey_name": "Vanderbilt Unity Poll",
833
+ "variable_name": "In general, do you think artificial intelligence (AI) will make the life of you and your family:",
834
+ "response_label": "Better NET",
835
+ "pct": "29",
836
+ "count": ""
837
+ },
838
+ "2025_June_35": {
839
+ "topline_id": "2025_June_35",
840
+ "poll_date": "2025-June",
841
+ "year": "2025",
842
+ "month": "June",
843
+ "survey_name": "Vanderbilt Unity Poll",
844
+ "variable_name": "In general, do you think artificial intelligence (AI) will make the life of you and your family:",
845
+ "response_label": "Worse NET",
846
+ "pct": "36",
847
+ "count": ""
848
+ },
849
+ "2025_June_36": {
850
+ "topline_id": "2025_June_36",
851
+ "poll_date": "2025-June",
852
+ "year": "2025",
853
+ "month": "June",
854
+ "survey_name": "Vanderbilt Unity Poll",
855
+ "variable_name": "In general, do you think artificial intelligence (AI) will make the life of you and your family:",
856
+ "response_label": "Much better",
857
+ "pct": "4",
858
+ "count": ""
859
+ },
860
+ "2025_June_37": {
861
+ "topline_id": "2025_June_37",
862
+ "poll_date": "2025-June",
863
+ "year": "2025",
864
+ "month": "June",
865
+ "survey_name": "Vanderbilt Unity Poll",
866
+ "variable_name": "In general, do you think artificial intelligence (AI) will make the life of you and your family:",
867
+ "response_label": "Somewhat better",
868
+ "pct": "25",
869
+ "count": ""
870
+ },
871
+ "2025_June_38": {
872
+ "topline_id": "2025_June_38",
873
+ "poll_date": "2025-June",
874
+ "year": "2025",
875
+ "month": "June",
876
+ "survey_name": "Vanderbilt Unity Poll",
877
+ "variable_name": "In general, do you think artificial intelligence (AI) will make the life of you and your family:",
878
+ "response_label": "Neither better nor worse",
879
+ "pct": "34",
880
+ "count": ""
881
+ },
882
+ "2025_June_39": {
883
+ "topline_id": "2025_June_39",
884
+ "poll_date": "2025-June",
885
+ "year": "2025",
886
+ "month": "June",
887
+ "survey_name": "Vanderbilt Unity Poll",
888
+ "variable_name": "In general, do you think artificial intelligence (AI) will make the life of you and your family:",
889
+ "response_label": "Somewhat worse",
890
+ "pct": "22",
891
+ "count": ""
892
+ },
893
+ "2025_June_40": {
894
+ "topline_id": "2025_June_40",
895
+ "poll_date": "2025-June",
896
+ "year": "2025",
897
+ "month": "June",
898
+ "survey_name": "Vanderbilt Unity Poll",
899
+ "variable_name": "In general, do you think artificial intelligence (AI) will make the life of you and your family:",
900
+ "response_label": "Much worse",
901
+ "pct": "14",
902
+ "count": ""
903
+ },
904
+ "2025_June_41": {
905
+ "topline_id": "2025_June_41",
906
+ "poll_date": "2025-June",
907
+ "year": "2025",
908
+ "month": "June",
909
+ "survey_name": "Vanderbilt Unity Poll",
910
+ "variable_name": "In general, do you think artificial intelligence (AI) will make the life of you and your family:",
911
+ "response_label": "Don't know",
912
+ "pct": "0",
913
+ "count": ""
914
+ },
915
+ "2025_June_42": {
916
+ "topline_id": "2025_June_42",
917
+ "poll_date": "2025-June",
918
+ "year": "2025",
919
+ "month": "June",
920
+ "survey_name": "Vanderbilt Unity Poll",
921
+ "variable_name": "In general, do you think artificial intelligence (AI) will make the life of you and your family:",
922
+ "response_label": "Refused",
923
+ "pct": "0",
924
+ "count": ""
925
+ },
926
+ "2025_June_43": {
927
+ "topline_id": "2025_June_43",
928
+ "poll_date": "2025-June",
929
+ "year": "2025",
930
+ "month": "June",
931
+ "survey_name": "Vanderbilt Unity Poll",
932
+ "variable_name": "Which of the following emotions best describes the way you currently feel about the actions the Trump administration has taken so far during its term?",
933
+ "response_label": "Satisfied NET",
934
+ "pct": "31",
935
+ "count": ""
936
+ },
937
+ "2025_June_44": {
938
+ "topline_id": "2025_June_44",
939
+ "poll_date": "2025-June",
940
+ "year": "2025",
941
+ "month": "June",
942
+ "survey_name": "Vanderbilt Unity Poll",
943
+ "variable_name": "Which of the following emotions best describes the way you currently feel about the actions the Trump administration has taken so far during its term?",
944
+ "response_label": "Dissatisfied NET",
945
+ "pct": "55",
946
+ "count": ""
947
+ },
948
+ "2025_June_45": {
949
+ "topline_id": "2025_June_45",
950
+ "poll_date": "2025-June",
951
+ "year": "2025",
952
+ "month": "June",
953
+ "survey_name": "Vanderbilt Unity Poll",
954
+ "variable_name": "Which of the following emotions best describes the way you currently feel about the actions the Trump administration has taken so far during its term?",
955
+ "response_label": "Enthusiastic",
956
+ "pct": "13",
957
+ "count": ""
958
+ },
959
+ "2025_June_46": {
960
+ "topline_id": "2025_June_46",
961
+ "poll_date": "2025-June",
962
+ "year": "2025",
963
+ "month": "June",
964
+ "survey_name": "Vanderbilt Unity Poll",
965
+ "variable_name": "Which of the following emotions best describes the way you currently feel about the actions the Trump administration has taken so far during its term?",
966
+ "response_label": "Satisfied, but not enthusiastic",
967
+ "pct": "18",
968
+ "count": ""
969
+ },
970
+ "2025_June_47": {
971
+ "topline_id": "2025_June_47",
972
+ "poll_date": "2025-June",
973
+ "year": "2025",
974
+ "month": "June",
975
+ "survey_name": "Vanderbilt Unity Poll",
976
+ "variable_name": "Which of the following emotions best describes the way you currently feel about the actions the Trump administration has taken so far during its term?",
977
+ "response_label": "Don\u2019t have a reaction either way",
978
+ "pct": "14",
979
+ "count": ""
980
+ },
981
+ "2025_June_48": {
982
+ "topline_id": "2025_June_48",
983
+ "poll_date": "2025-June",
984
+ "year": "2025",
985
+ "month": "June",
986
+ "survey_name": "Vanderbilt Unity Poll",
987
+ "variable_name": "Which of the following emotions best describes the way you currently feel about the actions the Trump administration has taken so far during its term?",
988
+ "response_label": "Dissatisfied, but not angry",
989
+ "pct": "22",
990
+ "count": ""
991
+ },
992
+ "2025_June_49": {
993
+ "topline_id": "2025_June_49",
994
+ "poll_date": "2025-June",
995
+ "year": "2025",
996
+ "month": "June",
997
+ "survey_name": "Vanderbilt Unity Poll",
998
+ "variable_name": "Which of the following emotions best describes the way you currently feel about the actions the Trump administration has taken so far during its term?",
999
+ "response_label": "Angry",
1000
+ "pct": "33",
1001
+ "count": ""
1002
+ },
1003
+ "2025_June_50": {
1004
+ "topline_id": "2025_June_50",
1005
+ "poll_date": "2025-June",
1006
+ "year": "2025",
1007
+ "month": "June",
1008
+ "survey_name": "Vanderbilt Unity Poll",
1009
+ "variable_name": "Which of the following emotions best describes the way you currently feel about the actions the Trump administration has taken so far during its term?",
1010
+ "response_label": "Don\u2019t know",
1011
+ "pct": "0",
1012
+ "count": ""
1013
+ },
1014
+ "2025_June_51": {
1015
+ "topline_id": "2025_June_51",
1016
+ "poll_date": "2025-June",
1017
+ "year": "2025",
1018
+ "month": "June",
1019
+ "survey_name": "Vanderbilt Unity Poll",
1020
+ "variable_name": "Which of the following emotions best describes the way you currently feel about the actions the Trump administration has taken so far during its term?",
1021
+ "response_label": "Refused",
1022
+ "pct": "0",
1023
+ "count": ""
1024
+ },
1025
+ "2025_June_52": {
1026
+ "topline_id": "2025_June_52",
1027
+ "poll_date": "2025-June",
1028
+ "year": "2025",
1029
+ "month": "June",
1030
+ "survey_name": "Vanderbilt Unity Poll",
1031
+ "variable_name": "Which of the following emotions best describes the way you currently feel about the actions the Trump administration has taken so far during its term?",
1032
+ "response_label": "Don't know",
1033
+ "pct": "0",
1034
+ "count": ""
1035
+ },
1036
+ "2025_June_53": {
1037
+ "topline_id": "2025_June_53",
1038
+ "poll_date": "2025-June",
1039
+ "year": "2025",
1040
+ "month": "June",
1041
+ "survey_name": "Vanderbilt Unity Poll",
1042
+ "variable_name": "Do you approve or disapprove of the job that Joe Biden did as president?",
1043
+ "response_label": "Strong/Somewhat approve NET",
1044
+ "pct": "40",
1045
+ "count": ""
1046
+ },
1047
+ "2025_June_54": {
1048
+ "topline_id": "2025_June_54",
1049
+ "poll_date": "2025-June",
1050
+ "year": "2025",
1051
+ "month": "June",
1052
+ "survey_name": "Vanderbilt Unity Poll",
1053
+ "variable_name": "Do you approve or disapprove of the job that Joe Biden did as president?",
1054
+ "response_label": "Somewhat/Strong disapprove NET",
1055
+ "pct": "59",
1056
+ "count": ""
1057
+ },
1058
+ "2025_June_55": {
1059
+ "topline_id": "2025_June_55",
1060
+ "poll_date": "2025-June",
1061
+ "year": "2025",
1062
+ "month": "June",
1063
+ "survey_name": "Vanderbilt Unity Poll",
1064
+ "variable_name": "Do you approve or disapprove of the job that Joe Biden did as president?",
1065
+ "response_label": "Strongly approve",
1066
+ "pct": "9",
1067
+ "count": ""
1068
+ },
1069
+ "2025_June_56": {
1070
+ "topline_id": "2025_June_56",
1071
+ "poll_date": "2025-June",
1072
+ "year": "2025",
1073
+ "month": "June",
1074
+ "survey_name": "Vanderbilt Unity Poll",
1075
+ "variable_name": "Do you approve or disapprove of the job that Joe Biden did as president?",
1076
+ "response_label": "Somewhat approve",
1077
+ "pct": "32",
1078
+ "count": ""
1079
+ },
1080
+ "2025_June_57": {
1081
+ "topline_id": "2025_June_57",
1082
+ "poll_date": "2025-June",
1083
+ "year": "2025",
1084
+ "month": "June",
1085
+ "survey_name": "Vanderbilt Unity Poll",
1086
+ "variable_name": "Do you approve or disapprove of the job that Joe Biden did as president?",
1087
+ "response_label": "Somewhat disapprove",
1088
+ "pct": "22",
1089
+ "count": ""
1090
+ },
1091
+ "2025_June_58": {
1092
+ "topline_id": "2025_June_58",
1093
+ "poll_date": "2025-June",
1094
+ "year": "2025",
1095
+ "month": "June",
1096
+ "survey_name": "Vanderbilt Unity Poll",
1097
+ "variable_name": "Do you approve or disapprove of the job that Joe Biden did as president?",
1098
+ "response_label": "Strongly disapprove",
1099
+ "pct": "38",
1100
+ "count": ""
1101
+ },
1102
+ "2025_June_59": {
1103
+ "topline_id": "2025_June_59",
1104
+ "poll_date": "2025-June",
1105
+ "year": "2025",
1106
+ "month": "June",
1107
+ "survey_name": "Vanderbilt Unity Poll",
1108
+ "variable_name": "Do you approve or disapprove of the job that Joe Biden did as president?",
1109
+ "response_label": "Don't know",
1110
+ "pct": "0",
1111
+ "count": ""
1112
+ },
1113
+ "2025_June_60": {
1114
+ "topline_id": "2025_June_60",
1115
+ "poll_date": "2025-June",
1116
+ "year": "2025",
1117
+ "month": "June",
1118
+ "survey_name": "Vanderbilt Unity Poll",
1119
+ "variable_name": "Do you approve or disapprove of the job that Joe Biden did as president?",
1120
+ "response_label": "Refused",
1121
+ "pct": "0",
1122
+ "count": ""
1123
+ },
1124
+ "2025_June_61": {
1125
+ "topline_id": "2025_June_61",
1126
+ "poll_date": "2025-June",
1127
+ "year": "2025",
1128
+ "month": "June",
1129
+ "survey_name": "Vanderbilt Unity Poll",
1130
+ "variable_name": "How much do you support or oppose deporting individuals who are living in the United States illegally back to their home countries without a formal hearing before a judge in an immigration court?",
1131
+ "response_label": "Strong/Somewhat support NET",
1132
+ "pct": "39",
1133
+ "count": ""
1134
+ },
1135
+ "2025_June_62": {
1136
+ "topline_id": "2025_June_62",
1137
+ "poll_date": "2025-June",
1138
+ "year": "2025",
1139
+ "month": "June",
1140
+ "survey_name": "Vanderbilt Unity Poll",
1141
+ "variable_name": "How much do you support or oppose deporting individuals who are living in the United States illegally back to their home countries without a formal hearing before a judge in an immigration court?",
1142
+ "response_label": "Somewhat/Strong oppose NET",
1143
+ "pct": "46",
1144
+ "count": ""
1145
+ },
1146
+ "2025_June_63": {
1147
+ "topline_id": "2025_June_63",
1148
+ "poll_date": "2025-June",
1149
+ "year": "2025",
1150
+ "month": "June",
1151
+ "survey_name": "Vanderbilt Unity Poll",
1152
+ "variable_name": "How much do you support or oppose deporting individuals who are living in the United States illegally back to their home countries without a formal hearing before a judge in an immigration court?",
1153
+ "response_label": "Strongly support",
1154
+ "pct": "21",
1155
+ "count": ""
1156
+ },
1157
+ "2025_June_64": {
1158
+ "topline_id": "2025_June_64",
1159
+ "poll_date": "2025-June",
1160
+ "year": "2025",
1161
+ "month": "June",
1162
+ "survey_name": "Vanderbilt Unity Poll",
1163
+ "variable_name": "How much do you support or oppose deporting individuals who are living in the United States illegally back to their home countries without a formal hearing before a judge in an immigration court?",
1164
+ "response_label": "Somewhat support",
1165
+ "pct": "18",
1166
+ "count": ""
1167
+ },
1168
+ "2025_June_65": {
1169
+ "topline_id": "2025_June_65",
1170
+ "poll_date": "2025-June",
1171
+ "year": "2025",
1172
+ "month": "June",
1173
+ "survey_name": "Vanderbilt Unity Poll",
1174
+ "variable_name": "How much do you support or oppose deporting individuals who are living in the United States illegally back to their home countries without a formal hearing before a judge in an immigration court?",
1175
+ "response_label": "Neither support nor oppose",
1176
+ "pct": "15",
1177
+ "count": ""
1178
+ },
1179
+ "2025_June_66": {
1180
+ "topline_id": "2025_June_66",
1181
+ "poll_date": "2025-June",
1182
+ "year": "2025",
1183
+ "month": "June",
1184
+ "survey_name": "Vanderbilt Unity Poll",
1185
+ "variable_name": "How much do you support or oppose deporting individuals who are living in the United States illegally back to their home countries without a formal hearing before a judge in an immigration court?",
1186
+ "response_label": "Somewhat oppose",
1187
+ "pct": "11",
1188
+ "count": ""
1189
+ },
1190
+ "2025_June_67": {
1191
+ "topline_id": "2025_June_67",
1192
+ "poll_date": "2025-June",
1193
+ "year": "2025",
1194
+ "month": "June",
1195
+ "survey_name": "Vanderbilt Unity Poll",
1196
+ "variable_name": "How much do you support or oppose deporting individuals who are living in the United States illegally back to their home countries without a formal hearing before a judge in an immigration court?",
1197
+ "response_label": "Strongly oppose",
1198
+ "pct": "35",
1199
+ "count": ""
1200
+ },
1201
+ "2025_June_68": {
1202
+ "topline_id": "2025_June_68",
1203
+ "poll_date": "2025-June",
1204
+ "year": "2025",
1205
+ "month": "June",
1206
+ "survey_name": "Vanderbilt Unity Poll",
1207
+ "variable_name": "How much do you support or oppose deporting individuals who are living in the United States illegally back to their home countries without a formal hearing before a judge in an immigration court?",
1208
+ "response_label": "Don't know",
1209
+ "pct": "0",
1210
+ "count": ""
1211
+ },
1212
+ "2025_June_69": {
1213
+ "topline_id": "2025_June_69",
1214
+ "poll_date": "2025-June",
1215
+ "year": "2025",
1216
+ "month": "June",
1217
+ "survey_name": "Vanderbilt Unity Poll",
1218
+ "variable_name": "How much do you support or oppose deporting individuals who are living in the United States illegally back to their home countries without a formal hearing before a judge in an immigration court?",
1219
+ "response_label": "Refused",
1220
+ "pct": "0",
1221
+ "count": ""
1222
+ },
1223
+ "2025_June_70": {
1224
+ "topline_id": "2025_June_70",
1225
+ "poll_date": "2025-June",
1226
+ "year": "2025",
1227
+ "month": "June",
1228
+ "survey_name": "Vanderbilt Unity Poll",
1229
+ "variable_name": "How much do you support or oppose deporting individuals who are living in the United States illegally back to their home countries?",
1230
+ "response_label": "Strong/Somewhat support NET",
1231
+ "pct": "51",
1232
+ "count": ""
1233
+ },
1234
+ "2025_June_71": {
1235
+ "topline_id": "2025_June_71",
1236
+ "poll_date": "2025-June",
1237
+ "year": "2025",
1238
+ "month": "June",
1239
+ "survey_name": "Vanderbilt Unity Poll",
1240
+ "variable_name": "How much do you support or oppose deporting individuals who are living in the United States illegally back to their home countries?",
1241
+ "response_label": "Somewhat/Strong oppose NET",
1242
+ "pct": "31",
1243
+ "count": ""
1244
+ },
1245
+ "2025_June_72": {
1246
+ "topline_id": "2025_June_72",
1247
+ "poll_date": "2025-June",
1248
+ "year": "2025",
1249
+ "month": "June",
1250
+ "survey_name": "Vanderbilt Unity Poll",
1251
+ "variable_name": "How much do you support or oppose deporting individuals who are living in the United States illegally back to their home countries?",
1252
+ "response_label": "Strongly support",
1253
+ "pct": "27",
1254
+ "count": ""
1255
+ },
1256
+ "2025_June_73": {
1257
+ "topline_id": "2025_June_73",
1258
+ "poll_date": "2025-June",
1259
+ "year": "2025",
1260
+ "month": "June",
1261
+ "survey_name": "Vanderbilt Unity Poll",
1262
+ "variable_name": "How much do you support or oppose deporting individuals who are living in the United States illegally back to their home countries?",
1263
+ "response_label": "Somewhat support",
1264
+ "pct": "24",
1265
+ "count": ""
1266
+ },
1267
+ "2025_June_74": {
1268
+ "topline_id": "2025_June_74",
1269
+ "poll_date": "2025-June",
1270
+ "year": "2025",
1271
+ "month": "June",
1272
+ "survey_name": "Vanderbilt Unity Poll",
1273
+ "variable_name": "How much do you support or oppose deporting individuals who are living in the United States illegally back to their home countries?",
1274
+ "response_label": "Neither support nor oppose",
1275
+ "pct": "18",
1276
+ "count": ""
1277
+ },
1278
+ "2025_June_75": {
1279
+ "topline_id": "2025_June_75",
1280
+ "poll_date": "2025-June",
1281
+ "year": "2025",
1282
+ "month": "June",
1283
+ "survey_name": "Vanderbilt Unity Poll",
1284
+ "variable_name": "How much do you support or oppose deporting individuals who are living in the United States illegally back to their home countries?",
1285
+ "response_label": "Somewhat oppose",
1286
+ "pct": "14",
1287
+ "count": ""
1288
+ },
1289
+ "2025_June_76": {
1290
+ "topline_id": "2025_June_76",
1291
+ "poll_date": "2025-June",
1292
+ "year": "2025",
1293
+ "month": "June",
1294
+ "survey_name": "Vanderbilt Unity Poll",
1295
+ "variable_name": "How much do you support or oppose deporting individuals who are living in the United States illegally back to their home countries?",
1296
+ "response_label": "Strongly oppose",
1297
+ "pct": "17",
1298
+ "count": ""
1299
+ },
1300
+ "2025_June_77": {
1301
+ "topline_id": "2025_June_77",
1302
+ "poll_date": "2025-June",
1303
+ "year": "2025",
1304
+ "month": "June",
1305
+ "survey_name": "Vanderbilt Unity Poll",
1306
+ "variable_name": "How much do you support or oppose deporting individuals who are living in the United States illegally back to their home countries?",
1307
+ "response_label": "Don't know",
1308
+ "pct": "0",
1309
+ "count": ""
1310
+ },
1311
+ "2025_June_78": {
1312
+ "topline_id": "2025_June_78",
1313
+ "poll_date": "2025-June",
1314
+ "year": "2025",
1315
+ "month": "June",
1316
+ "survey_name": "Vanderbilt Unity Poll",
1317
+ "variable_name": "How much do you support or oppose deporting individuals who are living in the United States illegally back to their home countries?",
1318
+ "response_label": "Refused",
1319
+ "pct": "0",
1320
+ "count": ""
1321
+ },
1322
+ "2025_June_79": {
1323
+ "topline_id": "2025_June_79",
1324
+ "poll_date": "2025-June",
1325
+ "year": "2025",
1326
+ "month": "June",
1327
+ "survey_name": "Vanderbilt Unity Poll",
1328
+ "variable_name": "How important is a college education for a young person to succeed in the world today?",
1329
+ "response_label": "Important NET",
1330
+ "pct": "76",
1331
+ "count": ""
1332
+ },
1333
+ "2025_June_80": {
1334
+ "topline_id": "2025_June_80",
1335
+ "poll_date": "2025-June",
1336
+ "year": "2025",
1337
+ "month": "June",
1338
+ "survey_name": "Vanderbilt Unity Poll",
1339
+ "variable_name": "How important is a college education for a young person to succeed in the world today?",
1340
+ "response_label": "Not important NET",
1341
+ "pct": "24",
1342
+ "count": ""
1343
+ },
1344
+ "2025_June_81": {
1345
+ "topline_id": "2025_June_81",
1346
+ "poll_date": "2025-June",
1347
+ "year": "2025",
1348
+ "month": "June",
1349
+ "survey_name": "Vanderbilt Unity Poll",
1350
+ "variable_name": "How important is a college education for a young person to succeed in the world today?",
1351
+ "response_label": "Very important",
1352
+ "pct": "31",
1353
+ "count": ""
1354
+ },
1355
+ "2025_June_82": {
1356
+ "topline_id": "2025_June_82",
1357
+ "poll_date": "2025-June",
1358
+ "year": "2025",
1359
+ "month": "June",
1360
+ "survey_name": "Vanderbilt Unity Poll",
1361
+ "variable_name": "How important is a college education for a young person to succeed in the world today?",
1362
+ "response_label": "Somewhat important",
1363
+ "pct": "46",
1364
+ "count": ""
1365
+ },
1366
+ "2025_June_83": {
1367
+ "topline_id": "2025_June_83",
1368
+ "poll_date": "2025-June",
1369
+ "year": "2025",
1370
+ "month": "June",
1371
+ "survey_name": "Vanderbilt Unity Poll",
1372
+ "variable_name": "How important is a college education for a young person to succeed in the world today?",
1373
+ "response_label": "Not too important",
1374
+ "pct": "19",
1375
+ "count": ""
1376
+ },
1377
+ "2025_June_84": {
1378
+ "topline_id": "2025_June_84",
1379
+ "poll_date": "2025-June",
1380
+ "year": "2025",
1381
+ "month": "June",
1382
+ "survey_name": "Vanderbilt Unity Poll",
1383
+ "variable_name": "How important is a college education for a young person to succeed in the world today?",
1384
+ "response_label": "Not at all important",
1385
+ "pct": "4",
1386
+ "count": ""
1387
+ },
1388
+ "2025_June_85": {
1389
+ "topline_id": "2025_June_85",
1390
+ "poll_date": "2025-June",
1391
+ "year": "2025",
1392
+ "month": "June",
1393
+ "survey_name": "Vanderbilt Unity Poll",
1394
+ "variable_name": "How important is a college education for a young person to succeed in the world today?",
1395
+ "response_label": "Don't know",
1396
+ "pct": "0",
1397
+ "count": ""
1398
+ },
1399
+ "2025_June_86": {
1400
+ "topline_id": "2025_June_86",
1401
+ "poll_date": "2025-June",
1402
+ "year": "2025",
1403
+ "month": "June",
1404
+ "survey_name": "Vanderbilt Unity Poll",
1405
+ "variable_name": "How important is a college education for a young person to succeed in the world today?",
1406
+ "response_label": "Refused",
1407
+ "pct": "0",
1408
+ "count": ""
1409
+ },
1410
+ "2025_June_87": {
1411
+ "topline_id": "2025_June_87",
1412
+ "poll_date": "2025-June",
1413
+ "year": "2025",
1414
+ "month": "June",
1415
+ "survey_name": "Vanderbilt Unity Poll",
1416
+ "variable_name": "Do you approve or disapprove of cutting resources to and ultimately eliminating the Department of Education?",
1417
+ "response_label": "Strong/Somewhat approve NET",
1418
+ "pct": "32",
1419
+ "count": ""
1420
+ },
1421
+ "2025_June_88": {
1422
+ "topline_id": "2025_June_88",
1423
+ "poll_date": "2025-June",
1424
+ "year": "2025",
1425
+ "month": "June",
1426
+ "survey_name": "Vanderbilt Unity Poll",
1427
+ "variable_name": "Do you approve or disapprove of cutting resources to and ultimately eliminating the Department of Education?",
1428
+ "response_label": "Somewhat/Strong disapprove NET",
1429
+ "pct": "68",
1430
+ "count": ""
1431
+ },
1432
+ "2025_June_89": {
1433
+ "topline_id": "2025_June_89",
1434
+ "poll_date": "2025-June",
1435
+ "year": "2025",
1436
+ "month": "June",
1437
+ "survey_name": "Vanderbilt Unity Poll",
1438
+ "variable_name": "Do you approve or disapprove of cutting resources to and ultimately eliminating the Department of Education?",
1439
+ "response_label": "Strongly approve",
1440
+ "pct": "15",
1441
+ "count": ""
1442
+ },
1443
+ "2025_June_90": {
1444
+ "topline_id": "2025_June_90",
1445
+ "poll_date": "2025-June",
1446
+ "year": "2025",
1447
+ "month": "June",
1448
+ "survey_name": "Vanderbilt Unity Poll",
1449
+ "variable_name": "Do you approve or disapprove of cutting resources to and ultimately eliminating the Department of Education?",
1450
+ "response_label": "Somewhat approve",
1451
+ "pct": "17",
1452
+ "count": ""
1453
+ },
1454
+ "2025_June_91": {
1455
+ "topline_id": "2025_June_91",
1456
+ "poll_date": "2025-June",
1457
+ "year": "2025",
1458
+ "month": "June",
1459
+ "survey_name": "Vanderbilt Unity Poll",
1460
+ "variable_name": "Do you approve or disapprove of cutting resources to and ultimately eliminating the Department of Education?",
1461
+ "response_label": "Somewhat disapprove",
1462
+ "pct": "25",
1463
+ "count": ""
1464
+ },
1465
+ "2025_June_92": {
1466
+ "topline_id": "2025_June_92",
1467
+ "poll_date": "2025-June",
1468
+ "year": "2025",
1469
+ "month": "June",
1470
+ "survey_name": "Vanderbilt Unity Poll",
1471
+ "variable_name": "Do you approve or disapprove of cutting resources to and ultimately eliminating the Department of Education?",
1472
+ "response_label": "Strongly disapprove",
1473
+ "pct": "43",
1474
+ "count": ""
1475
+ },
1476
+ "2025_June_93": {
1477
+ "topline_id": "2025_June_93",
1478
+ "poll_date": "2025-June",
1479
+ "year": "2025",
1480
+ "month": "June",
1481
+ "survey_name": "Vanderbilt Unity Poll",
1482
+ "variable_name": "Do you approve or disapprove of cutting resources to and ultimately eliminating the Department of Education?",
1483
+ "response_label": "Don't know",
1484
+ "pct": "0",
1485
+ "count": ""
1486
+ },
1487
+ "2025_June_94": {
1488
+ "topline_id": "2025_June_94",
1489
+ "poll_date": "2025-June",
1490
+ "year": "2025",
1491
+ "month": "June",
1492
+ "survey_name": "Vanderbilt Unity Poll",
1493
+ "variable_name": "Do you approve or disapprove of cutting resources to and ultimately eliminating the Department of Education?",
1494
+ "response_label": "Refused",
1495
+ "pct": "0",
1496
+ "count": ""
1497
+ },
1498
+ "2025_June_95": {
1499
+ "topline_id": "2025_June_95",
1500
+ "poll_date": "2025-June",
1501
+ "year": "2025",
1502
+ "month": "June",
1503
+ "survey_name": "Vanderbilt Unity Poll",
1504
+ "variable_name": "Do you approve or disapprove of the way Donald Trump is handling issues related to colleges and universities?",
1505
+ "response_label": "Strong/Somewhat approve NET",
1506
+ "pct": "39",
1507
+ "count": ""
1508
+ },
1509
+ "2025_June_96": {
1510
+ "topline_id": "2025_June_96",
1511
+ "poll_date": "2025-June",
1512
+ "year": "2025",
1513
+ "month": "June",
1514
+ "survey_name": "Vanderbilt Unity Poll",
1515
+ "variable_name": "Do you approve or disapprove of the way Donald Trump is handling issues related to colleges and universities?",
1516
+ "response_label": "Somewhat/Strong disapprove NET",
1517
+ "pct": "61",
1518
+ "count": ""
1519
+ },
1520
+ "2025_June_97": {
1521
+ "topline_id": "2025_June_97",
1522
+ "poll_date": "2025-June",
1523
+ "year": "2025",
1524
+ "month": "June",
1525
+ "survey_name": "Vanderbilt Unity Poll",
1526
+ "variable_name": "Do you approve or disapprove of the way Donald Trump is handling issues related to colleges and universities?",
1527
+ "response_label": "Strongly approve",
1528
+ "pct": "14",
1529
+ "count": ""
1530
+ },
1531
+ "2025_June_98": {
1532
+ "topline_id": "2025_June_98",
1533
+ "poll_date": "2025-June",
1534
+ "year": "2025",
1535
+ "month": "June",
1536
+ "survey_name": "Vanderbilt Unity Poll",
1537
+ "variable_name": "Do you approve or disapprove of the way Donald Trump is handling issues related to colleges and universities?",
1538
+ "response_label": "Somewhat approve",
1539
+ "pct": "25",
1540
+ "count": ""
1541
+ },
1542
+ "2025_June_99": {
1543
+ "topline_id": "2025_June_99",
1544
+ "poll_date": "2025-June",
1545
+ "year": "2025",
1546
+ "month": "June",
1547
+ "survey_name": "Vanderbilt Unity Poll",
1548
+ "variable_name": "Do you approve or disapprove of the way Donald Trump is handling issues related to colleges and universities?",
1549
+ "response_label": "Somewhat disapprove",
1550
+ "pct": "24",
1551
+ "count": ""
1552
+ },
1553
+ "2025_June_100": {
1554
+ "topline_id": "2025_June_100",
1555
+ "poll_date": "2025-June",
1556
+ "year": "2025",
1557
+ "month": "June",
1558
+ "survey_name": "Vanderbilt Unity Poll",
1559
+ "variable_name": "Do you approve or disapprove of the way Donald Trump is handling issues related to colleges and universities?",
1560
+ "response_label": "Strongly disapprove",
1561
+ "pct": "37",
1562
+ "count": ""
1563
+ },
1564
+ "2025_June_101": {
1565
+ "topline_id": "2025_June_101",
1566
+ "poll_date": "2025-June",
1567
+ "year": "2025",
1568
+ "month": "June",
1569
+ "survey_name": "Vanderbilt Unity Poll",
1570
+ "variable_name": "Do you approve or disapprove of the way Donald Trump is handling issues related to colleges and universities?",
1571
+ "response_label": "Don't know",
1572
+ "pct": "0",
1573
+ "count": ""
1574
+ },
1575
+ "2025_June_102": {
1576
+ "topline_id": "2025_June_102",
1577
+ "poll_date": "2025-June",
1578
+ "year": "2025",
1579
+ "month": "June",
1580
+ "survey_name": "Vanderbilt Unity Poll",
1581
+ "variable_name": "Do you approve or disapprove of the way Donald Trump is handling issues related to colleges and universities?",
1582
+ "response_label": "Refused",
1583
+ "pct": "0",
1584
+ "count": ""
1585
+ },
1586
+ "2025_June_103": {
1587
+ "topline_id": "2025_June_103",
1588
+ "poll_date": "2025-June",
1589
+ "year": "2025",
1590
+ "month": "June",
1591
+ "survey_name": "Vanderbilt Unity Poll",
1592
+ "variable_name": "How much confidence do you have in public colleges and universities?",
1593
+ "response_label": "Confident NET",
1594
+ "pct": "46",
1595
+ "count": ""
1596
+ },
1597
+ "2025_June_104": {
1598
+ "topline_id": "2025_June_104",
1599
+ "poll_date": "2025-June",
1600
+ "year": "2025",
1601
+ "month": "June",
1602
+ "survey_name": "Vanderbilt Unity Poll",
1603
+ "variable_name": "How much confidence do you have in public colleges and universities?",
1604
+ "response_label": "Not confident NET",
1605
+ "pct": "15",
1606
+ "count": ""
1607
+ },
1608
+ "2025_June_105": {
1609
+ "topline_id": "2025_June_105",
1610
+ "poll_date": "2025-June",
1611
+ "year": "2025",
1612
+ "month": "June",
1613
+ "survey_name": "Vanderbilt Unity Poll",
1614
+ "variable_name": "How much confidence do you have in public colleges and universities?",
1615
+ "response_label": "A great deal",
1616
+ "pct": "12",
1617
+ "count": ""
1618
+ },
1619
+ "2025_June_106": {
1620
+ "topline_id": "2025_June_106",
1621
+ "poll_date": "2025-June",
1622
+ "year": "2025",
1623
+ "month": "June",
1624
+ "survey_name": "Vanderbilt Unity Poll",
1625
+ "variable_name": "How much confidence do you have in public colleges and universities?",
1626
+ "response_label": "Quite a lot",
1627
+ "pct": "34",
1628
+ "count": ""
1629
+ },
1630
+ "2025_June_107": {
1631
+ "topline_id": "2025_June_107",
1632
+ "poll_date": "2025-June",
1633
+ "year": "2025",
1634
+ "month": "June",
1635
+ "survey_name": "Vanderbilt Unity Poll",
1636
+ "variable_name": "How much confidence do you have in public colleges and universities?",
1637
+ "response_label": "Some",
1638
+ "pct": "39",
1639
+ "count": ""
1640
+ },
1641
+ "2025_June_108": {
1642
+ "topline_id": "2025_June_108",
1643
+ "poll_date": "2025-June",
1644
+ "year": "2025",
1645
+ "month": "June",
1646
+ "survey_name": "Vanderbilt Unity Poll",
1647
+ "variable_name": "How much confidence do you have in public colleges and universities?",
1648
+ "response_label": "Very little",
1649
+ "pct": "9",
1650
+ "count": ""
1651
+ },
1652
+ "2025_June_109": {
1653
+ "topline_id": "2025_June_109",
1654
+ "poll_date": "2025-June",
1655
+ "year": "2025",
1656
+ "month": "June",
1657
+ "survey_name": "Vanderbilt Unity Poll",
1658
+ "variable_name": "How much confidence do you have in public colleges and universities?",
1659
+ "response_label": "None at all",
1660
+ "pct": "6",
1661
+ "count": ""
1662
+ },
1663
+ "2025_June_110": {
1664
+ "topline_id": "2025_June_110",
1665
+ "poll_date": "2025-June",
1666
+ "year": "2025",
1667
+ "month": "June",
1668
+ "survey_name": "Vanderbilt Unity Poll",
1669
+ "variable_name": "How much confidence do you have in public colleges and universities?",
1670
+ "response_label": "Don't know",
1671
+ "pct": "0",
1672
+ "count": ""
1673
+ },
1674
+ "2025_June_111": {
1675
+ "topline_id": "2025_June_111",
1676
+ "poll_date": "2025-June",
1677
+ "year": "2025",
1678
+ "month": "June",
1679
+ "survey_name": "Vanderbilt Unity Poll",
1680
+ "variable_name": "How much confidence do you have in public colleges and universities?",
1681
+ "response_label": "Refused",
1682
+ "pct": "0",
1683
+ "count": ""
1684
+ },
1685
+ "2025_June_112": {
1686
+ "topline_id": "2025_June_112",
1687
+ "poll_date": "2025-June",
1688
+ "year": "2025",
1689
+ "month": "June",
1690
+ "survey_name": "Vanderbilt Unity Poll",
1691
+ "variable_name": "How much confidence do you have in private colleges and universities?",
1692
+ "response_label": "Confident NET",
1693
+ "pct": "30",
1694
+ "count": ""
1695
+ },
1696
+ "2025_June_113": {
1697
+ "topline_id": "2025_June_113",
1698
+ "poll_date": "2025-June",
1699
+ "year": "2025",
1700
+ "month": "June",
1701
+ "survey_name": "Vanderbilt Unity Poll",
1702
+ "variable_name": "How much confidence do you have in private colleges and universities?",
1703
+ "response_label": "Not confident NET",
1704
+ "pct": "26",
1705
+ "count": ""
1706
+ },
1707
+ "2025_June_114": {
1708
+ "topline_id": "2025_June_114",
1709
+ "poll_date": "2025-June",
1710
+ "year": "2025",
1711
+ "month": "June",
1712
+ "survey_name": "Vanderbilt Unity Poll",
1713
+ "variable_name": "How much confidence do you have in private colleges and universities?",
1714
+ "response_label": "A great deal",
1715
+ "pct": "11",
1716
+ "count": ""
1717
+ },
1718
+ "2025_June_115": {
1719
+ "topline_id": "2025_June_115",
1720
+ "poll_date": "2025-June",
1721
+ "year": "2025",
1722
+ "month": "June",
1723
+ "survey_name": "Vanderbilt Unity Poll",
1724
+ "variable_name": "How much confidence do you have in private colleges and universities?",
1725
+ "response_label": "Quite a lot",
1726
+ "pct": "20",
1727
+ "count": ""
1728
+ },
1729
+ "2025_June_116": {
1730
+ "topline_id": "2025_June_116",
1731
+ "poll_date": "2025-June",
1732
+ "year": "2025",
1733
+ "month": "June",
1734
+ "survey_name": "Vanderbilt Unity Poll",
1735
+ "variable_name": "How much confidence do you have in private colleges and universities?",
1736
+ "response_label": "Some",
1737
+ "pct": "44",
1738
+ "count": ""
1739
+ },
1740
+ "2025_June_117": {
1741
+ "topline_id": "2025_June_117",
1742
+ "poll_date": "2025-June",
1743
+ "year": "2025",
1744
+ "month": "June",
1745
+ "survey_name": "Vanderbilt Unity Poll",
1746
+ "variable_name": "How much confidence do you have in private colleges and universities?",
1747
+ "response_label": "Very little",
1748
+ "pct": "20",
1749
+ "count": ""
1750
+ },
1751
+ "2025_June_118": {
1752
+ "topline_id": "2025_June_118",
1753
+ "poll_date": "2025-June",
1754
+ "year": "2025",
1755
+ "month": "June",
1756
+ "survey_name": "Vanderbilt Unity Poll",
1757
+ "variable_name": "How much confidence do you have in private colleges and universities?",
1758
+ "response_label": "None at all",
1759
+ "pct": "6",
1760
+ "count": ""
1761
+ },
1762
+ "2025_June_119": {
1763
+ "topline_id": "2025_June_119",
1764
+ "poll_date": "2025-June",
1765
+ "year": "2025",
1766
+ "month": "June",
1767
+ "survey_name": "Vanderbilt Unity Poll",
1768
+ "variable_name": "How much confidence do you have in private colleges and universities?",
1769
+ "response_label": "Don't know",
1770
+ "pct": "0",
1771
+ "count": ""
1772
+ },
1773
+ "2025_June_120": {
1774
+ "topline_id": "2025_June_120",
1775
+ "poll_date": "2025-June",
1776
+ "year": "2025",
1777
+ "month": "June",
1778
+ "survey_name": "Vanderbilt Unity Poll",
1779
+ "variable_name": "How much confidence do you have in private colleges and universities?",
1780
+ "response_label": "Refused",
1781
+ "pct": "0",
1782
+ "count": ""
1783
+ },
1784
+ "2025_June_121": {
1785
+ "topline_id": "2025_June_121",
1786
+ "poll_date": "2025-June",
1787
+ "year": "2025",
1788
+ "month": "June",
1789
+ "survey_name": "Vanderbilt Unity Poll",
1790
+ "variable_name": "How much do you support or oppose the U.S. government monitoring the social media activity of current and prospective foreign students at U.S. colleges and universities?",
1791
+ "response_label": "Strong/Somewhat support NET",
1792
+ "pct": "33",
1793
+ "count": ""
1794
+ },
1795
+ "2025_June_122": {
1796
+ "topline_id": "2025_June_122",
1797
+ "poll_date": "2025-June",
1798
+ "year": "2025",
1799
+ "month": "June",
1800
+ "survey_name": "Vanderbilt Unity Poll",
1801
+ "variable_name": "How much do you support or oppose the U.S. government monitoring the social media activity of current and prospective foreign students at U.S. colleges and universities?",
1802
+ "response_label": "Somewhat/Strong oppose NET",
1803
+ "pct": "40",
1804
+ "count": ""
1805
+ },
1806
+ "2025_June_123": {
1807
+ "topline_id": "2025_June_123",
1808
+ "poll_date": "2025-June",
1809
+ "year": "2025",
1810
+ "month": "June",
1811
+ "survey_name": "Vanderbilt Unity Poll",
1812
+ "variable_name": "How much do you support or oppose the U.S. government monitoring the social media activity of current and prospective foreign students at U.S. colleges and universities?",
1813
+ "response_label": "Strongly support",
1814
+ "pct": "14",
1815
+ "count": ""
1816
+ },
1817
+ "2025_June_124": {
1818
+ "topline_id": "2025_June_124",
1819
+ "poll_date": "2025-June",
1820
+ "year": "2025",
1821
+ "month": "June",
1822
+ "survey_name": "Vanderbilt Unity Poll",
1823
+ "variable_name": "How much do you support or oppose the U.S. government monitoring the social media activity of current and prospective foreign students at U.S. colleges and universities?",
1824
+ "response_label": "Somewhat support",
1825
+ "pct": "19",
1826
+ "count": ""
1827
+ },
1828
+ "2025_June_125": {
1829
+ "topline_id": "2025_June_125",
1830
+ "poll_date": "2025-June",
1831
+ "year": "2025",
1832
+ "month": "June",
1833
+ "survey_name": "Vanderbilt Unity Poll",
1834
+ "variable_name": "How much do you support or oppose the U.S. government monitoring the social media activity of current and prospective foreign students at U.S. colleges and universities?",
1835
+ "response_label": "Neither support nor oppose",
1836
+ "pct": "26",
1837
+ "count": ""
1838
+ },
1839
+ "2025_June_126": {
1840
+ "topline_id": "2025_June_126",
1841
+ "poll_date": "2025-June",
1842
+ "year": "2025",
1843
+ "month": "June",
1844
+ "survey_name": "Vanderbilt Unity Poll",
1845
+ "variable_name": "How much do you support or oppose the U.S. government monitoring the social media activity of current and prospective foreign students at U.S. colleges and universities?",
1846
+ "response_label": "Somewhat oppose",
1847
+ "pct": "14",
1848
+ "count": ""
1849
+ },
1850
+ "2025_June_127": {
1851
+ "topline_id": "2025_June_127",
1852
+ "poll_date": "2025-June",
1853
+ "year": "2025",
1854
+ "month": "June",
1855
+ "survey_name": "Vanderbilt Unity Poll",
1856
+ "variable_name": "How much do you support or oppose the U.S. government monitoring the social media activity of current and prospective foreign students at U.S. colleges and universities?",
1857
+ "response_label": "Strongly oppose",
1858
+ "pct": "27",
1859
+ "count": ""
1860
+ },
1861
+ "2025_June_128": {
1862
+ "topline_id": "2025_June_128",
1863
+ "poll_date": "2025-June",
1864
+ "year": "2025",
1865
+ "month": "June",
1866
+ "survey_name": "Vanderbilt Unity Poll",
1867
+ "variable_name": "How much do you support or oppose the U.S. government monitoring the social media activity of current and prospective foreign students at U.S. colleges and universities?",
1868
+ "response_label": "Don't know",
1869
+ "pct": "0",
1870
+ "count": ""
1871
+ },
1872
+ "2025_June_129": {
1873
+ "topline_id": "2025_June_129",
1874
+ "poll_date": "2025-June",
1875
+ "year": "2025",
1876
+ "month": "June",
1877
+ "survey_name": "Vanderbilt Unity Poll",
1878
+ "variable_name": "How much do you support or oppose the U.S. government monitoring the social media activity of current and prospective foreign students at U.S. colleges and universities?",
1879
+ "response_label": "Refused",
1880
+ "pct": "0",
1881
+ "count": ""
1882
+ },
1883
+ "2025_June_130": {
1884
+ "topline_id": "2025_June_130",
1885
+ "poll_date": "2025-June",
1886
+ "year": "2025",
1887
+ "month": "June",
1888
+ "survey_name": "Vanderbilt Unity Poll",
1889
+ "variable_name": "Which of the following comes closer to your own views, even if neither is exactly right?",
1890
+ "response_label": "Foreign students who graduate from colleges and universities should be granted a legal green card so they can stay and work in the United States after they graduate.",
1891
+ "pct": "59",
1892
+ "count": ""
1893
+ },
1894
+ "2025_June_131": {
1895
+ "topline_id": "2025_June_131",
1896
+ "poll_date": "2025-June",
1897
+ "year": "2025",
1898
+ "month": "June",
1899
+ "survey_name": "Vanderbilt Unity Poll",
1900
+ "variable_name": "Which of the following comes closer to your own views, even if neither is exactly right?",
1901
+ "response_label": "Foreign students who graduate but fail to secure a job that will sponsor their visa should be sent back to their home countries.",
1902
+ "pct": "40",
1903
+ "count": ""
1904
+ },
1905
+ "2025_June_132": {
1906
+ "topline_id": "2025_June_132",
1907
+ "poll_date": "2025-June",
1908
+ "year": "2025",
1909
+ "month": "June",
1910
+ "survey_name": "Vanderbilt Unity Poll",
1911
+ "variable_name": "Which of the following comes closer to your own views, even if neither is exactly right?",
1912
+ "response_label": "Don\u2019t know",
1913
+ "pct": "0",
1914
+ "count": ""
1915
+ },
1916
+ "2025_June_133": {
1917
+ "topline_id": "2025_June_133",
1918
+ "poll_date": "2025-June",
1919
+ "year": "2025",
1920
+ "month": "June",
1921
+ "survey_name": "Vanderbilt Unity Poll",
1922
+ "variable_name": "Which of the following comes closer to your own views, even if neither is exactly right?",
1923
+ "response_label": "Refused",
1924
+ "pct": "0",
1925
+ "count": ""
1926
+ },
1927
+ "2025_June_134": {
1928
+ "topline_id": "2025_June_134",
1929
+ "poll_date": "2025-June",
1930
+ "year": "2025",
1931
+ "month": "June",
1932
+ "survey_name": "Vanderbilt Unity Poll",
1933
+ "variable_name": "Which of the following comes closer to your own views, even if neither is exactly right?",
1934
+ "response_label": "Don't know",
1935
+ "pct": "0",
1936
+ "count": ""
1937
+ },
1938
+ "2025_June_135": {
1939
+ "topline_id": "2025_June_135",
1940
+ "poll_date": "2025-June",
1941
+ "year": "2025",
1942
+ "month": "June",
1943
+ "survey_name": "Vanderbilt Unity Poll",
1944
+ "variable_name": "The Trump administration announced it is moving to \"aggressively revoke\" visas of current international students from China who are attending U.S. colleges and universities. How much do you support or",
1945
+ "response_label": "Strong/Somewhat support NET",
1946
+ "pct": "31",
1947
+ "count": ""
1948
+ },
1949
+ "2025_June_136": {
1950
+ "topline_id": "2025_June_136",
1951
+ "poll_date": "2025-June",
1952
+ "year": "2025",
1953
+ "month": "June",
1954
+ "survey_name": "Vanderbilt Unity Poll",
1955
+ "variable_name": "The Trump administration announced it is moving to \"aggressively revoke\" visas of current international students from China who are attending U.S. colleges and universities. How much do you support or",
1956
+ "response_label": "Somewhat/Strong oppose NET",
1957
+ "pct": "46",
1958
+ "count": ""
1959
+ },
1960
+ "2025_June_137": {
1961
+ "topline_id": "2025_June_137",
1962
+ "poll_date": "2025-June",
1963
+ "year": "2025",
1964
+ "month": "June",
1965
+ "survey_name": "Vanderbilt Unity Poll",
1966
+ "variable_name": "The Trump administration announced it is moving to \"aggressively revoke\" visas of current international students from China who are attending U.S. colleges and universities. How much do you support or",
1967
+ "response_label": "Strongly support",
1968
+ "pct": "16",
1969
+ "count": ""
1970
+ },
1971
+ "2025_June_138": {
1972
+ "topline_id": "2025_June_138",
1973
+ "poll_date": "2025-June",
1974
+ "year": "2025",
1975
+ "month": "June",
1976
+ "survey_name": "Vanderbilt Unity Poll",
1977
+ "variable_name": "The Trump administration announced it is moving to \"aggressively revoke\" visas of current international students from China who are attending U.S. colleges and universities. How much do you support or",
1978
+ "response_label": "Somewhat support",
1979
+ "pct": "15",
1980
+ "count": ""
1981
+ },
1982
+ "2025_June_139": {
1983
+ "topline_id": "2025_June_139",
1984
+ "poll_date": "2025-June",
1985
+ "year": "2025",
1986
+ "month": "June",
1987
+ "survey_name": "Vanderbilt Unity Poll",
1988
+ "variable_name": "The Trump administration announced it is moving to \"aggressively revoke\" visas of current international students from China who are attending U.S. colleges and universities. How much do you support or",
1989
+ "response_label": "Neither support nor oppose",
1990
+ "pct": "23",
1991
+ "count": ""
1992
+ },
1993
+ "2025_June_140": {
1994
+ "topline_id": "2025_June_140",
1995
+ "poll_date": "2025-June",
1996
+ "year": "2025",
1997
+ "month": "June",
1998
+ "survey_name": "Vanderbilt Unity Poll",
1999
+ "variable_name": "The Trump administration announced it is moving to \"aggressively revoke\" visas of current international students from China who are attending U.S. colleges and universities. How much do you support or",
2000
+ "response_label": "Somewhat oppose",
2001
+ "pct": "18",
2002
+ "count": ""
2003
+ },
2004
+ "2025_June_141": {
2005
+ "topline_id": "2025_June_141",
2006
+ "poll_date": "2025-June",
2007
+ "year": "2025",
2008
+ "month": "June",
2009
+ "survey_name": "Vanderbilt Unity Poll",
2010
+ "variable_name": "The Trump administration announced it is moving to \"aggressively revoke\" visas of current international students from China who are attending U.S. colleges and universities. How much do you support or",
2011
+ "response_label": "Strongly oppose",
2012
+ "pct": "28",
2013
+ "count": ""
2014
+ },
2015
+ "2025_June_142": {
2016
+ "topline_id": "2025_June_142",
2017
+ "poll_date": "2025-June",
2018
+ "year": "2025",
2019
+ "month": "June",
2020
+ "survey_name": "Vanderbilt Unity Poll",
2021
+ "variable_name": "The Trump administration announced it is moving to \"aggressively revoke\" visas of current international students from China who are attending U.S. colleges and universities. How much do you support or",
2022
+ "response_label": "Don't know",
2023
+ "pct": "0",
2024
+ "count": ""
2025
+ },
2026
+ "2025_June_143": {
2027
+ "topline_id": "2025_June_143",
2028
+ "poll_date": "2025-June",
2029
+ "year": "2025",
2030
+ "month": "June",
2031
+ "survey_name": "Vanderbilt Unity Poll",
2032
+ "variable_name": "The Trump administration announced it is moving to \"aggressively revoke\" visas of current international students from China who are attending U.S. colleges and universities. How much do you support or",
2033
+ "response_label": "Refused",
2034
+ "pct": "0",
2035
+ "count": ""
2036
+ },
2037
+ "2025_June_144": {
2038
+ "topline_id": "2025_June_144",
2039
+ "poll_date": "2025-June",
2040
+ "year": "2025",
2041
+ "month": "June",
2042
+ "survey_name": "Vanderbilt Unity Poll",
2043
+ "variable_name": "Do you think the President of the United States should be able to determine if colleges and universities can enroll foreign students?",
2044
+ "response_label": "Yes",
2045
+ "pct": "22",
2046
+ "count": ""
2047
+ },
2048
+ "2025_June_145": {
2049
+ "topline_id": "2025_June_145",
2050
+ "poll_date": "2025-June",
2051
+ "year": "2025",
2052
+ "month": "June",
2053
+ "survey_name": "Vanderbilt Unity Poll",
2054
+ "variable_name": "Do you think the President of the United States should be able to determine if colleges and universities can enroll foreign students?",
2055
+ "response_label": "No",
2056
+ "pct": "61",
2057
+ "count": ""
2058
+ },
2059
+ "2025_June_146": {
2060
+ "topline_id": "2025_June_146",
2061
+ "poll_date": "2025-June",
2062
+ "year": "2025",
2063
+ "month": "June",
2064
+ "survey_name": "Vanderbilt Unity Poll",
2065
+ "variable_name": "Do you think the President of the United States should be able to determine if colleges and universities can enroll foreign students?",
2066
+ "response_label": "Don\u2019t know",
2067
+ "pct": "17",
2068
+ "count": ""
2069
+ },
2070
+ "2025_June_147": {
2071
+ "topline_id": "2025_June_147",
2072
+ "poll_date": "2025-June",
2073
+ "year": "2025",
2074
+ "month": "June",
2075
+ "survey_name": "Vanderbilt Unity Poll",
2076
+ "variable_name": "Do you think the President of the United States should be able to determine if colleges and universities can enroll foreign students?",
2077
+ "response_label": "Refused",
2078
+ "pct": "0",
2079
+ "count": ""
2080
+ },
2081
+ "2025_June_148": {
2082
+ "topline_id": "2025_June_148",
2083
+ "poll_date": "2025-June",
2084
+ "year": "2025",
2085
+ "month": "June",
2086
+ "survey_name": "Vanderbilt Unity Poll",
2087
+ "variable_name": "Do you think the President of the United States should be able to determine if colleges and universities can enroll foreign students?",
2088
+ "response_label": "Don't know",
2089
+ "pct": "0",
2090
+ "count": ""
2091
+ },
2092
+ "2025_June_149": {
2093
+ "topline_id": "2025_June_149",
2094
+ "poll_date": "2025-June",
2095
+ "year": "2025",
2096
+ "month": "June",
2097
+ "survey_name": "Vanderbilt Unity Poll",
2098
+ "variable_name": "Which of the following comes closer to your own views, even if neither is exactly right?",
2099
+ "response_label": "The president should have the authority to ignore court rulings he disagrees with",
2100
+ "pct": "17",
2101
+ "count": ""
2102
+ },
2103
+ "2025_June_150": {
2104
+ "topline_id": "2025_June_150",
2105
+ "poll_date": "2025-June",
2106
+ "year": "2025",
2107
+ "month": "June",
2108
+ "survey_name": "Vanderbilt Unity Poll",
2109
+ "variable_name": "Which of the following comes closer to your own views, even if neither is exactly right?",
2110
+ "response_label": "The president should be required to follow court rulings even if he disagrees with them",
2111
+ "pct": "83",
2112
+ "count": ""
2113
+ },
2114
+ "2025_June_151": {
2115
+ "topline_id": "2025_June_151",
2116
+ "poll_date": "2025-June",
2117
+ "year": "2025",
2118
+ "month": "June",
2119
+ "survey_name": "Vanderbilt Unity Poll",
2120
+ "variable_name": "Which of the following comes closer to your own views, even if neither is exactly right?",
2121
+ "response_label": "Don\u2019t know",
2122
+ "pct": "0",
2123
+ "count": ""
2124
+ },
2125
+ "2025_June_152": {
2126
+ "topline_id": "2025_June_152",
2127
+ "poll_date": "2025-June",
2128
+ "year": "2025",
2129
+ "month": "June",
2130
+ "survey_name": "Vanderbilt Unity Poll",
2131
+ "variable_name": "Which of the following comes closer to your own views, even if neither is exactly right?",
2132
+ "response_label": "Refused",
2133
+ "pct": "0",
2134
+ "count": ""
2135
+ },
2136
+ "2025_June_153": {
2137
+ "topline_id": "2025_June_153",
2138
+ "poll_date": "2025-June",
2139
+ "year": "2025",
2140
+ "month": "June",
2141
+ "survey_name": "Vanderbilt Unity Poll",
2142
+ "variable_name": "Which of the following comes closer to your own views, even if neither is exactly right?",
2143
+ "response_label": "Don't know",
2144
+ "pct": "0",
2145
+ "count": ""
2146
+ },
2147
+ "2025_June_154": {
2148
+ "topline_id": "2025_June_154",
2149
+ "poll_date": "2025-June",
2150
+ "year": "2025",
2151
+ "month": "June",
2152
+ "survey_name": "Vanderbilt Unity Poll",
2153
+ "variable_name": "When it comes to political matters, do you consider yourself to be:",
2154
+ "response_label": "More of a supporter of the Make America Great Again or MAGA movement",
2155
+ "pct": "44",
2156
+ "count": ""
2157
+ },
2158
+ "2025_June_155": {
2159
+ "topline_id": "2025_June_155",
2160
+ "poll_date": "2025-June",
2161
+ "year": "2025",
2162
+ "month": "June",
2163
+ "survey_name": "Vanderbilt Unity Poll",
2164
+ "variable_name": "When it comes to political matters, do you consider yourself to be:",
2165
+ "response_label": "More of a supporter of the Republican Party",
2166
+ "pct": "56",
2167
+ "count": ""
2168
+ },
2169
+ "2025_June_156": {
2170
+ "topline_id": "2025_June_156",
2171
+ "poll_date": "2025-June",
2172
+ "year": "2025",
2173
+ "month": "June",
2174
+ "survey_name": "Vanderbilt Unity Poll",
2175
+ "variable_name": "When it comes to political matters, do you consider yourself to be:",
2176
+ "response_label": "Don't know",
2177
+ "pct": "0",
2178
+ "count": ""
2179
+ },
2180
+ "2025_June_157": {
2181
+ "topline_id": "2025_June_157",
2182
+ "poll_date": "2025-June",
2183
+ "year": "2025",
2184
+ "month": "June",
2185
+ "survey_name": "Vanderbilt Unity Poll",
2186
+ "variable_name": "When it comes to political matters, do you consider yourself to be:",
2187
+ "response_label": "Refused",
2188
+ "pct": "0",
2189
+ "count": ""
2190
+ },
2191
+ "2025_June_158": {
2192
+ "topline_id": "2025_June_158",
2193
+ "poll_date": "2025-June",
2194
+ "year": "2025",
2195
+ "month": "June",
2196
+ "survey_name": "Vanderbilt Unity Poll",
2197
+ "variable_name": "In talking to people about elections, we often find that a lot of people were not able to vote because they weren\u2019t registered, they were sick, or they just didn\u2019t have time. How about you, did you ha",
2198
+ "response_label": "Yes, did vote",
2199
+ "pct": "71",
2200
+ "count": ""
2201
+ },
2202
+ "2025_June_159": {
2203
+ "topline_id": "2025_June_159",
2204
+ "poll_date": "2025-June",
2205
+ "year": "2025",
2206
+ "month": "June",
2207
+ "survey_name": "Vanderbilt Unity Poll",
2208
+ "variable_name": "In talking to people about elections, we often find that a lot of people were not able to vote because they weren\u2019t registered, they were sick, or they just didn\u2019t have time. How about you, did you ha",
2209
+ "response_label": "No, did not vote",
2210
+ "pct": "29",
2211
+ "count": ""
2212
+ },
2213
+ "2025_June_160": {
2214
+ "topline_id": "2025_June_160",
2215
+ "poll_date": "2025-June",
2216
+ "year": "2025",
2217
+ "month": "June",
2218
+ "survey_name": "Vanderbilt Unity Poll",
2219
+ "variable_name": "In talking to people about elections, we often find that a lot of people were not able to vote because they weren\u2019t registered, they were sick, or they just didn\u2019t have time. How about you, did you ha",
2220
+ "response_label": "Don't know",
2221
+ "pct": "0",
2222
+ "count": ""
2223
+ },
2224
+ "2025_June_161": {
2225
+ "topline_id": "2025_June_161",
2226
+ "poll_date": "2025-June",
2227
+ "year": "2025",
2228
+ "month": "June",
2229
+ "survey_name": "Vanderbilt Unity Poll",
2230
+ "variable_name": "In talking to people about elections, we often find that a lot of people were not able to vote because they weren\u2019t registered, they were sick, or they just didn\u2019t have time. How about you, did you ha",
2231
+ "response_label": "Refused",
2232
+ "pct": "0",
2233
+ "count": ""
2234
+ },
2235
+ "2025_June_162": {
2236
+ "topline_id": "2025_June_162",
2237
+ "poll_date": "2025-June",
2238
+ "year": "2025",
2239
+ "month": "June",
2240
+ "survey_name": "Vanderbilt Unity Poll",
2241
+ "variable_name": "Which presidential candidate did you vote for in 2024?",
2242
+ "response_label": "I voted for Kamala Harris",
2243
+ "pct": "48",
2244
+ "count": ""
2245
+ },
2246
+ "2025_June_163": {
2247
+ "topline_id": "2025_June_163",
2248
+ "poll_date": "2025-June",
2249
+ "year": "2025",
2250
+ "month": "June",
2251
+ "survey_name": "Vanderbilt Unity Poll",
2252
+ "variable_name": "Which presidential candidate did you vote for in 2024?",
2253
+ "response_label": "I voted for Donald Trump",
2254
+ "pct": "45",
2255
+ "count": ""
2256
+ },
2257
+ "2025_June_164": {
2258
+ "topline_id": "2025_June_164",
2259
+ "poll_date": "2025-June",
2260
+ "year": "2025",
2261
+ "month": "June",
2262
+ "survey_name": "Vanderbilt Unity Poll",
2263
+ "variable_name": "Which presidential candidate did you vote for in 2024?",
2264
+ "response_label": "I voted for another candidate",
2265
+ "pct": "7",
2266
+ "count": ""
2267
+ },
2268
+ "2025_June_165": {
2269
+ "topline_id": "2025_June_165",
2270
+ "poll_date": "2025-June",
2271
+ "year": "2025",
2272
+ "month": "June",
2273
+ "survey_name": "Vanderbilt Unity Poll",
2274
+ "variable_name": "Which presidential candidate did you vote for in 2024?",
2275
+ "response_label": "Don't know",
2276
+ "pct": "0",
2277
+ "count": ""
2278
+ },
2279
+ "2025_June_166": {
2280
+ "topline_id": "2025_June_166",
2281
+ "poll_date": "2025-June",
2282
+ "year": "2025",
2283
+ "month": "June",
2284
+ "survey_name": "Vanderbilt Unity Poll",
2285
+ "variable_name": "Which presidential candidate did you vote for in 2024?",
2286
+ "response_label": "Refused",
2287
+ "pct": "0",
2288
+ "count": ""
2289
+ }
2290
+ }