Spaces:

raahinaez
/

doc

Runtime error

App Files Community

raahinaez committed 11 days ago

Commit

f017143

verified ·

1 Parent(s): d1556c5

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -28

app.py CHANGED Viewed

@@ -2,20 +2,20 @@ import os
 import gradio as gr
 import pdfplumber
 import torch
-from transformers import AutoTokenizer, AutoModel
 # -------------------------
 # Settings
 # -------------------------
 model_name = "ibm-granite/granite-docling-258M"
-hf_token = os.environ.get("HF_HUB_TOKEN")  # Set this in Space secrets
 # -------------------------
 # Load tokenizer and model
 # -------------------------
-tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
-model = AutoModel.from_pretrained(model_name, token=hf_token)
-model.eval()  # inference mode
 # -------------------------
 # PDF → Text
@@ -30,38 +30,28 @@ def extract_text_from_pdf(pdf_file):
     return text
 # -------------------------
-# Text → JSON using Granite
 # -------------------------
 def pdf_to_json(pdf_file):
     text = extract_text_from_pdf(pdf_file)
-    # Chunk text if too long
-    max_length = 2048
-    chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
-    results = []
-    for chunk in chunks:
-        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=max_length)
-        with torch.no_grad():
-            outputs = model(**inputs)
-        # Decode tokens back to string
-        decoded = tokenizer.batch_decode(outputs.last_hidden_state.argmax(-1), skip_special_tokens=True)
-        results.append("".join(decoded))
-    # Combine all chunks
-    combined_result = "\n".join(results)
-    return combined_result
 # -------------------------
-# Gradio Interface
 # -------------------------
 interface = gr.Interface(
     fn=pdf_to_json,
     inputs=gr.File(file_types=[".pdf"]),
-    outputs=gr.Textbox(label="Generated JSON"),
-    title="PDF to JSON with Granite DocLing",
     description="Upload a PDF and get structured JSON output using ibm-granite/granite-docling-258M."
 )

 import gradio as gr
 import pdfplumber
 import torch
+from transformers import GraniteTokenizer, GraniteForDocumentParsing
 # -------------------------
 # Settings
 # -------------------------
 model_name = "ibm-granite/granite-docling-258M"
+hf_token = os.environ.get("HF_HUB_TOKEN")  # Set in Space secrets
 # -------------------------
 # Load tokenizer and model
 # -------------------------
+tokenizer = GraniteTokenizer.from_pretrained(model_name, token=hf_token)
+model = GraniteForDocumentParsing.from_pretrained(model_name, token=hf_token)
+model.eval()
 # -------------------------
 # PDF → Text
     return text
 # -------------------------
+# PDF → JSON
 # -------------------------
 def pdf_to_json(pdf_file):
     text = extract_text_from_pdf(pdf_file)
+    # Tokenize and process
+    inputs = tokenizer(text, return_tensors="pt", truncation=True)
+    with torch.no_grad():
+        outputs = model(**inputs)
+    # This returns the structured JSON output
+    parsed_json = model.decode(outputs)  # Granite's built-in decode method
+    return parsed_json
 # -------------------------
+# Gradio interface
 # -------------------------
 interface = gr.Interface(
     fn=pdf_to_json,
     inputs=gr.File(file_types=[".pdf"]),
+    outputs=gr.JSON(label="Parsed JSON"),
+    title="PDF → JSON with Granite DocLing",
     description="Upload a PDF and get structured JSON output using ibm-granite/granite-docling-258M."
 )