raahinaez committed (verified)
Commit f017143 · 1 parent: d1556c5

Update app.py

Files changed (1):
  1. app.py +18 -28
app.py CHANGED
@@ -2,20 +2,20 @@ import os
 import gradio as gr
 import pdfplumber
 import torch
-from transformers import AutoTokenizer, AutoModel
+from transformers import GraniteTokenizer, GraniteForDocumentParsing

 # -------------------------
 # Settings
 # -------------------------
 model_name = "ibm-granite/granite-docling-258M"
-hf_token = os.environ.get("HF_HUB_TOKEN") # Set this in Space secrets
+hf_token = os.environ.get("HF_HUB_TOKEN") # Set in Space secrets

 # -------------------------
 # Load tokenizer and model
 # -------------------------
-tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
-model = AutoModel.from_pretrained(model_name, token=hf_token)
-model.eval() # inference mode
+tokenizer = GraniteTokenizer.from_pretrained(model_name, token=hf_token)
+model = GraniteForDocumentParsing.from_pretrained(model_name, token=hf_token)
+model.eval()

 # -------------------------
 # PDF → Text
@@ -30,38 +30,28 @@ def extract_text_from_pdf(pdf_file):
     return text

 # -------------------------
-# Text → JSON using Granite
+# PDF → JSON
 # -------------------------
 def pdf_to_json(pdf_file):
     text = extract_text_from_pdf(pdf_file)
-
-    # Chunk text if too long
-    max_length = 2048
-    chunks = [text[i:i+max_length] for i in range(0, len(text), max_length)]
-
-    results = []
-    for chunk in chunks:
-        inputs = tokenizer(chunk, return_tensors="pt", truncation=True, max_length=max_length)
-
-        with torch.no_grad():
-            outputs = model(**inputs)
-
-        # Decode tokens back to string
-        decoded = tokenizer.batch_decode(outputs.last_hidden_state.argmax(-1), skip_special_tokens=True)
-        results.append("".join(decoded))
-
-    # Combine all chunks
-    combined_result = "\n".join(results)
-    return combined_result
+
+    # Tokenize and process
+    inputs = tokenizer(text, return_tensors="pt", truncation=True)
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    # This returns the structured JSON output
+    parsed_json = model.decode(outputs) # Granite's built-in decode method
+    return parsed_json

 # -------------------------
-# Gradio Interface
+# Gradio interface
 # -------------------------
 interface = gr.Interface(
     fn=pdf_to_json,
     inputs=gr.File(file_types=[".pdf"]),
-    outputs=gr.Textbox(label="Generated JSON"),
-    title="PDF to JSON with Granite DocLing",
+    outputs=gr.JSON(label="Parsed JSON"),
+    title="PDF → JSON with Granite DocLing",
     description="Upload a PDF and get structured JSON output using ibm-granite/granite-docling-258M."
 )
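
A note on the new code path: GraniteTokenizer, GraniteForDocumentParsing, and the model.decode(outputs) call introduced in this commit do not appear to be part of the transformers API, so the updated script is unlikely to run as committed. The sketch below shows one alternative wiring that sticks to generic, documented interfaces, under the assumption that granite-docling-258M is a vision-language checkpoint that loads through AutoProcessor / AutoModelForVision2Seq and is prompted with rendered page images rather than pdfplumber text. The prompt wording, render resolution, and max_new_tokens budget are illustrative choices, not documented behaviour of this model.

import os

import gradio as gr
import pdfplumber
import torch
from transformers import AutoModelForVision2Seq, AutoProcessor

model_name = "ibm-granite/granite-docling-258M"
hf_token = os.environ.get("HF_HUB_TOKEN")  # Optional; only needed for gated or private repos

# Assumption: the checkpoint loads via the generic vision-to-sequence classes.
processor = AutoProcessor.from_pretrained(model_name, token=hf_token)
model = AutoModelForVision2Seq.from_pretrained(model_name, token=hf_token)
model.eval()


def pdf_to_structured(pdf_file):
    """Render each PDF page to an image and ask the model to convert it to markup."""
    pages = []
    with pdfplumber.open(pdf_file) as pdf:  # gr.File hands over a temp file path
        for page in pdf.pages:
            image = page.to_image(resolution=150).original  # PIL image of the page

            # Chat-style prompt with one image; the instruction text is an assumption.
            messages = [{
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": "Convert this page to docling."},
                ],
            }]
            prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
            inputs = processor(text=prompt, images=[image], return_tensors="pt")

            with torch.no_grad():
                generated = model.generate(**inputs, max_new_tokens=1024)

            pages.append(processor.batch_decode(generated, skip_special_tokens=True)[0])

    # The raw output is document markup rather than JSON, so wrap it per page to
    # give the gr.JSON component something structured to display.
    return {"pages": pages}


interface = gr.Interface(
    fn=pdf_to_structured,
    inputs=gr.File(file_types=[".pdf"]),
    outputs=gr.JSON(label="Per-page model output"),
    title="PDF → structured markup with Granite Docling (sketch)",
    description="Upload a PDF; each page is rendered and passed to ibm-granite/granite-docling-258M.",
)

if __name__ == "__main__":
    interface.launch()

If strict JSON is the goal, the standalone docling library wraps this model family and can convert a PDF directly into a JSON-serialisable document, which would replace the manual page-rendering loop above.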