Spaces:

raahinaez
/

doc

Runtime error

App Files Files Community

raahinaez committed 11 days ago

Commit

eaa0c7a

verified ·

1 Parent(s): f017143

Update app.py

Browse files

Files changed (1) hide show

app.py +18 -49

app.py CHANGED Viewed

@@ -1,58 +1,27 @@
-import os
 import gradio as gr
-import pdfplumber
-import torch
-from transformers import GraniteTokenizer, GraniteForDocumentParsing
-# -------------------------
-# Settings
-# -------------------------
-model_name = "ibm-granite/granite-docling-258M"
-hf_token = os.environ.get("HF_HUB_TOKEN")  # Set in Space secrets
-# -------------------------
-# Load tokenizer and model
-# -------------------------
-tokenizer = GraniteTokenizer.from_pretrained(model_name, token=hf_token)
-model = GraniteForDocumentParsing.from_pretrained(model_name, token=hf_token)
-model.eval()
-# -------------------------
-# PDF → Text
-# -------------------------
-def extract_text_from_pdf(pdf_file):
-    text = ""
-    with pdfplumber.open(pdf_file.name) as pdf:
-        for page in pdf.pages:
-            page_text = page.extract_text()
-            if page_text:
-                text += page_text + "\n"
-    return text
-# -------------------------
-# PDF → JSON
-# -------------------------
-def pdf_to_json(pdf_file):
-    text = extract_text_from_pdf(pdf_file)
-    # Tokenize and process
-    inputs = tokenizer(text, return_tensors="pt", truncation=True)
-    with torch.no_grad():
-        outputs = model(**inputs)
-    # This returns the structured JSON output
-    parsed_json = model.decode(outputs)  # Granite's built-in decode method
-    return parsed_json
-# -------------------------
-# Gradio interface
-# -------------------------
 interface = gr.Interface(
-    fn=pdf_to_json,
     inputs=gr.File(file_types=[".pdf"]),
-    outputs=gr.JSON(label="Parsed JSON"),
-    title="PDF → JSON with Granite DocLing",
-    description="Upload a PDF and get structured JSON output using ibm-granite/granite-docling-258M."
 )
 if __name__ == "__main__":

+from docling.document_converter import DocumentConverter
+from docling.datamodel.base_models import InputFormat
+from docling.datamodel.pipeline_options import VlmPipelineOptions
+from docling.pipeline.vlm_pipeline import VlmPipeline
 import gradio as gr
+import os
+# Optionally set HF token as secret, but docling will handle download
+os.environ["HF_HUB_TOKEN"] = os.environ.get("HF_HUB_TOKEN", "")
+def pdf_to_markdown(pdf_file):
+    """Convert an uploaded PDF to Markdown using Docling's VLM pipeline.
+
+    Args:
+        pdf_file: The value Gradio hands over for the ``gr.File`` input.
+            Either a path string or an object exposing the path as
+            ``.name`` is accepted; ``None`` means nothing was uploaded.
+
+    Returns:
+        The Markdown export of the converted document, or an empty string
+        when no file was supplied.
+    """
+    # Imported locally because the module-level imports do not provide these
+    # names: the previous body referenced PdfFormatOption without importing
+    # it, which raised NameError on the first conversion.
+    from docling.datamodel import vlm_model_specs
+    from docling.document_converter import PdfFormatOption
+
+    if pdf_file is None:
+        return ""
+
+    # Accept both a plain path and a wrapper carrying the path in `.name`.
+    pdf_path = getattr(pdf_file, "name", pdf_file)
+
+    # The model is selected through `vlm_options`, using the Granite Docling
+    # preset shipped in docling.datamodel.vlm_model_specs.
+    pipeline_options = VlmPipelineOptions(
+        vlm_options=vlm_model_specs.GRANITEDOCLING_TRANSFORMERS,
+    )
+    converter = DocumentConverter(
+        format_options={
+            InputFormat.PDF: PdfFormatOption(
+                pipeline_cls=VlmPipeline,
+                pipeline_options=pipeline_options,
+            ),
+        }
+    )
+    doc = converter.convert(source=pdf_path).document
+    return doc.export_to_markdown()
 interface = gr.Interface(
+    fn=pdf_to_markdown,
     inputs=gr.File(file_types=[".pdf"]),
+    outputs="text"
 )
 if __name__ == "__main__":