raahinaez committed on
Commit
eaa0c7a
·
verified ·
1 Parent(s): f017143

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +18 -49
app.py CHANGED
@@ -1,58 +1,27 @@
1
- import os
 
 
 
2
  import gradio as gr
3
- import pdfplumber
4
- import torch
5
- from transformers import GraniteTokenizer, GraniteForDocumentParsing
6
-
7
- # -------------------------
8
- # Settings
9
- # -------------------------
10
- model_name = "ibm-granite/granite-docling-258M"
11
- hf_token = os.environ.get("HF_HUB_TOKEN") # Set in Space secrets
12
-
13
- # -------------------------
14
- # Load tokenizer and model
15
- # -------------------------
16
- tokenizer = GraniteTokenizer.from_pretrained(model_name, token=hf_token)
17
- model = GraniteForDocumentParsing.from_pretrained(model_name, token=hf_token)
18
- model.eval()
19
-
20
- # -------------------------
21
- # PDF → Text
22
- # -------------------------
23
- def extract_text_from_pdf(pdf_file):
24
- text = ""
25
- with pdfplumber.open(pdf_file.name) as pdf:
26
- for page in pdf.pages:
27
- page_text = page.extract_text()
28
- if page_text:
29
- text += page_text + "\n"
30
- return text
31
-
32
- # -------------------------
33
- # PDF → JSON
34
- # -------------------------
35
- def pdf_to_json(pdf_file):
36
- text = extract_text_from_pdf(pdf_file)
37
 
38
- # Tokenize and process
39
- inputs = tokenizer(text, return_tensors="pt", truncation=True)
40
- with torch.no_grad():
41
- outputs = model(**inputs)
42
 
43
- # This returns the structured JSON output
44
- parsed_json = model.decode(outputs) # Granite's built-in decode method
45
- return parsed_json
 
 
 
 
 
 
46
 
47
- # -------------------------
48
- # Gradio interface
49
- # -------------------------
50
  interface = gr.Interface(
51
- fn=pdf_to_json,
52
  inputs=gr.File(file_types=[".pdf"]),
53
- outputs=gr.JSON(label="Parsed JSON"),
54
- title="PDF → JSON with Granite DocLing",
55
- description="Upload a PDF and get structured JSON output using ibm-granite/granite-docling-258M."
56
  )
57
 
58
  if __name__ == "__main__":
 
1
+ from docling.document_converter import DocumentConverter
2
+ from docling.datamodel.base_models import InputFormat
3
+ from docling.datamodel.pipeline_options import VlmPipelineOptions
4
+ from docling.pipeline.vlm_pipeline import VlmPipeline
5
  import gradio as gr
6
+ import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
+ # Optionally set HF token as secret, but docling will handle download
9
+ os.environ["HF_HUB_TOKEN"] = os.environ.get("HF_HUB_TOKEN", "")
 
 
10
 
11
+ def pdf_to_markdown(pdf_file):
12
+ converter = DocumentConverter(
13
+ format_options={
14
+ InputFormat.PDF: PdfFormatOption(pipeline_cls=VlmPipeline,
15
+ pipeline_options=VlmPipelineOptions(vlm_model_specs="granite_docling"))
16
+ }
17
+ )
18
+ doc = converter.convert(source=pdf_file.name).document
19
+ return doc.export_to_markdown()
20
 
 
 
 
21
  interface = gr.Interface(
22
+ fn=pdf_to_markdown,
23
  inputs=gr.File(file_types=[".pdf"]),
24
+ outputs="text"
 
 
25
  )
26
 
27
  if __name__ == "__main__":