import gradio as gr
import pdfplumber
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hugging Face Hub identifier handed to both `from_pretrained` calls below.
# NOTE(review): confirm this id resolves to a model repository that can be
# loaded with AutoModelForSeq2SeqLM; the loads run at import time, so a wrong
# id or architecture stops the app before the UI is built.
model_name = "ibm-granite/granite-docling-258m-demo"

# Created once at module level and reused by every call to `pdf_to_json`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
def extract_text_from_pdf(pdf_file):
    """Return the concatenated text of every page in a PDF.

    Args:
        pdf_file: The uploaded file. Either a plain string path or an
            object exposing the path as ``.name``. ``None`` is treated as
            "nothing uploaded".

    Returns:
        The text of each page that yielded any, in page order, each page
        followed by a newline. An empty string when ``pdf_file`` is
        ``None`` or no page produced text.
    """
    if pdf_file is None:
        return ""

    # A plain string is already the path; anything else must carry ``.name``.
    pdf_path = pdf_file if isinstance(pdf_file, str) else pdf_file.name

    with pdfplumber.open(pdf_path) as pdf:
        page_texts = (page.extract_text() for page in pdf.pages)
        # Joined inside the ``with`` so pages are read while the PDF is open.
        # Pages whose extraction is empty/None are skipped entirely.
        return "".join(
            f"{page_text}\n" for page_text in page_texts if page_text
        )
def pdf_to_json(pdf_file):
    """Run the module-level model on the text extracted from an uploaded PDF.

    Args:
        pdf_file: The uploaded PDF, forwarded to ``extract_text_from_pdf``.
            ``None`` means nothing was uploaded.

    Returns:
        The first generated sequence decoded to a string with special
        tokens removed. An empty string when no file was given or no text
        could be extracted, in which case the model is not invoked.
    """
    if pdf_file is None:
        return ""

    text = extract_text_from_pdf(pdf_file)
    if not text.strip():
        # No page yielded text, so there is nothing to feed the model.
        return ""

    # Input is truncated at 2048 tokens; anything beyond that is dropped
    # before generation.
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=2048)
    outputs = model.generate(**inputs, max_new_tokens=1024)

    # Only the first generated sequence is decoded and returned.
    return tokenizer.decode(outputs[0], skip_special_tokens=True)
# Single-function UI: one PDF upload as input, one textbox as output.
interface = gr.Interface(
    fn=pdf_to_json,
    inputs=gr.File(file_types=[".pdf"]),
    outputs=gr.Textbox(label="Generated JSON"),
    title="PDF to JSON using Granite DocLing",
    description="Upload a PDF and get a JSON output using the ibm-granite/granite-docling-258m-demo model.",
)

# Launched unconditionally at module level, as in the original script.
interface.launch()