Spaces:

raahinaez
/

doc

Runtime error

App Files Community

raahinaez committed 11 days ago

Commit

3486df4

verified ·

1 Parent(s): cfb98af

Update app.py

Browse files

Files changed (1) hide show

app.py +43 -12

app.py CHANGED Viewed

@@ -1,16 +1,25 @@
 import os
 import gradio as gr
 import pdfplumber
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
 model_name = "ibm-granite/granite-docling-258M"
-# Use HF token stored in Space secrets
-hf_token = os.environ.get("HF_HUB_TOKEN")
 tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
-model = AutoModelForSeq2SeqLM.from_pretrained(model_name, token=hf_token)
 def extract_text_from_pdf(pdf_file):
     text = ""
     with pdfplumber.open(pdf_file.name) as pdf:
@@ -20,19 +29,41 @@ def extract_text_from_pdf(pdf_file):
                 text += page_text + "\n"
     return text
 def pdf_to_json(pdf_file):
     text = extract_text_from_pdf(pdf_file)
-    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=2048)
-    outputs = model.generate(**inputs, max_new_tokens=1024)
-    result = tokenizer.decode(outputs[0], skip_special_tokens=True)
-    return result
 interface = gr.Interface(
     fn=pdf_to_json,
     inputs=gr.File(file_types=[".pdf"]),
     outputs=gr.Textbox(label="Generated JSON"),
-    title="PDF to JSON using Granite DocLing",
-    description="Upload a PDF and get a JSON output using ibm-granite/granite-docling-258M."
 )
-interface.launch()

 import os
 import gradio as gr
 import pdfplumber
+import torch
+from transformers import AutoTokenizer, AutoModel
+# -------------------------
+# Settings
+# -------------------------
 model_name = "ibm-granite/granite-docling-258M"
+hf_token = os.environ.get("HF_HUB_TOKEN")  # Set this in Space secrets
+# -------------------------
+# Load tokenizer and model
+# -------------------------
 tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
+model = AutoModel.from_pretrained(model_name, token=hf_token)
+model.eval()  # inference mode
+# -------------------------
+# PDF → Text
+# -------------------------
 def extract_text_from_pdf(pdf_file):
     text = ""
     with pdfplumber.open(pdf_file.name) as pdf:
                 text += page_text + "\n"
     return text
+# -------------------------
+# Text → JSON using Granite
+# -------------------------
 def pdf_to_json(pdf_file):
     text = extract_text_from_pdf(pdf_file)
+    # Chunk text if too long
+    max_length = 2048
+    chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
+    results = []
+    for chunk in chunks:
+        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=max_length)
+        with torch.no_grad():
+            outputs = model(**inputs)
+        # Decode tokens back to string
+        decoded = tokenizer.batch_decode(outputs.last_hidden_state.argmax(-1), skip_special_tokens=True)
+        results.append("".join(decoded))
+    # Combine all chunks
+    combined_result = "\n".join(results)
+    return combined_result
+# -------------------------
+# Gradio Interface
+# -------------------------
 interface = gr.Interface(
     fn=pdf_to_json,
     inputs=gr.File(file_types=[".pdf"]),
     outputs=gr.Textbox(label="Generated JSON"),
+    title="PDF to JSON with Granite DocLing",
+    description="Upload a PDF and get structured JSON output using ibm-granite/granite-docling-258M."
 )
+if __name__ == "__main__":
+    interface.launch()