Spaces:
Build error
Build error
Update app.py
Browse files — updated the code to include FAISS and a transformer
app.py
CHANGED
|
@@ -2,6 +2,10 @@ import gradio as gr
|
|
| 2 |
import os
|
| 3 |
import pdfplumber
|
| 4 |
import together
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
import re
|
| 6 |
import unicodedata
|
| 7 |
from dotenv import load_dotenv
|
|
@@ -11,6 +15,40 @@ load_dotenv()
|
|
| 11 |
# Set up Together.AI API Key (Replace with your actual key)
|
| 12 |
assert os.getenv("TOGETHER_API_KEY"), "api key missing"
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
def clean_text(text):
|
| 15 |
"""Cleans extracted text for better processing by the model."""
|
| 16 |
print("cleaning")
|
|
@@ -26,6 +64,7 @@ def extract_text_from_pdf(pdf_file):
|
|
| 26 |
try:
|
| 27 |
with pdfplumber.open(pdf_file) as pdf:
|
| 28 |
text = " ".join(clean_text(text) for page in pdf.pages if (text := page.extract_text()))
|
|
|
|
| 29 |
return text
|
| 30 |
except Exception as e:
|
| 31 |
print(f"Error extracting text: {e}")
|
|
@@ -44,9 +83,12 @@ def chatbot(pdf_file, user_question):
|
|
| 44 |
text = extract_text_from_pdf(pdf_file)
|
| 45 |
if not text:
|
| 46 |
return "Could not extract any text from the PDF."
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
| 48 |
# Split into smaller chunks
|
| 49 |
-
chunks = split_text(
|
| 50 |
|
| 51 |
# Use only the first chunk (to optimize token usage)
|
| 52 |
prompt = f"Based on this document, answer the question:\n\nDocument:\n{chunks[0]}\n\nQuestion: {user_question}"
|
|
|
|
| 2 |
import os
|
| 3 |
import pdfplumber
|
| 4 |
import together
|
| 5 |
+
from transformers import pipeline
|
| 6 |
+
from sentence_transformers import SentenceTransformer
|
| 7 |
+
import faiss
|
| 8 |
+
import numpy as np
|
| 9 |
import re
|
| 10 |
import unicodedata
|
| 11 |
from dotenv import load_dotenv
|
|
|
|
| 15 |
# Set up Together.AI API Key (Replace with your actual key)
# Fail fast at import time if the key is absent. Use `raise` rather than
# `assert`: asserts are stripped when Python runs with -O, which would
# silently skip this validation.
if not os.getenv("TOGETHER_API_KEY"):
    raise RuntimeError("api key missing")

# Load LLaMA-2 Model
llama_pipe = pipeline("text-generation", model="meta-llama/Llama-2-7b-chat-hf")

# Load Sentence Transformer for embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Initialize FAISS index.
# Derive the dimension from the loaded model (384 for MiniLM) instead of
# hard-coding it, so the index stays consistent if the model is swapped.
embedding_dim = embedding_model.get_sentence_embedding_dimension()
index = faiss.IndexFlatL2(embedding_dim)
documents = []  # Store raw text for reference, aligned with index rows
|
| 29 |
+
|
| 30 |
+
def store_document(text):
    """Embed *text*, add the vector to the global FAISS index, and keep
    the raw text in ``documents`` (same position as its index row).

    Returns a short confirmation string for the UI.
    """
    print("storing document")

    # FAISS expects a 2-D float32 array; encode([...]) yields shape (1, dim).
    vector = np.array(embedding_model.encode([text]), dtype=np.float32)
    index.add(vector)
    documents.append(text)

    print(f"your document has been stored: \n{documents}")

    return "Document stored!"
|
| 40 |
+
|
| 41 |
+
def retrieve_document(query):
    """Return the stored document whose embedding is nearest to *query*.

    Raises:
        LookupError: if no documents have been stored yet. FAISS returns
            -1 for searches on an empty index, which would otherwise
            negatively index ``documents`` and return the wrong entry.
    """
    print(f"retrieving doc based on {query}")

    query_embedding = embedding_model.encode([query])
    _, closest_idx = index.search(np.array(query_embedding, dtype=np.float32), 1)

    best = closest_idx[0][0]
    if best < 0:
        raise LookupError("no documents stored; cannot retrieve")

    # Fix: the original f-string was missing the closing ']' of the
    # subscript ({documents[closest_idx[0][0]}), a SyntaxError that
    # prevented the app from building.
    print(f"retrieved: {documents[best]}")

    return documents[best]
|
| 50 |
+
|
| 51 |
+
|
| 52 |
def clean_text(text):
|
| 53 |
"""Cleans extracted text for better processing by the model."""
|
| 54 |
print("cleaning")
|
|
|
|
| 64 |
try:
|
| 65 |
with pdfplumber.open(pdf_file) as pdf:
|
| 66 |
text = " ".join(clean_text(text) for page in pdf.pages if (text := page.extract_text()))
|
| 67 |
+
store_document(text)
|
| 68 |
return text
|
| 69 |
except Exception as e:
|
| 70 |
print(f"Error extracting text: {e}")
|
|
|
|
| 83 |
text = extract_text_from_pdf(pdf_file)
|
| 84 |
if not text:
|
| 85 |
return "Could not extract any text from the PDF."
|
| 86 |
+
|
| 87 |
+
# retrieve the document relevant to the query
|
| 88 |
+
doc = retrieve_document(user_question)
|
| 89 |
+
|
| 90 |
# Split into smaller chunks
|
| 91 |
+
chunks = split_text(doc)
|
| 92 |
|
| 93 |
# Use only the first chunk (to optimize token usage)
|
| 94 |
prompt = f"Based on this document, answer the question:\n\nDocument:\n{chunks[0]}\n\nQuestion: {user_question}"
|