# py/extractive.py
# Extractive summarizer module adapted for deployment (TextRank + TF-IDF)
import re
import os
import random
import networkx as nx
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
try:
    import py_vncorenlp
    _VNCORENLP_AVAILABLE = True
except Exception:
    _VNCORENLP_AVAILABLE = False
# Optional path for stopwords; if absent we'll use None
DEFAULT_STOPWORDS_PATH = os.path.join("data", "Vietnamese-stopwords.txt")
def load_stopwords(path=None):
    p = path or DEFAULT_STOPWORDS_PATH
    if os.path.exists(p):
        try:
            with open(p, "r", encoding="utf-8") as f:
                return [m.strip().lower() for m in f.readlines() if m.strip()]
        except Exception:
            return None
    return None
def clean_text_basic(text):
    if not text:
        return ""
    # Remove caption lines like "Ảnh: ..." "Video: ..."
    text = re.sub(r'^.*(Ảnh|Video|Clip|Nguồn)\s*:[^(\n)]*.*$', '', text, flags=re.MULTILINE | re.IGNORECASE)
    text = re.sub(r'\(?\[?(Ảnh|Video|Clip)\s*:[^)]+[\)\]]?', '', text, flags=re.IGNORECASE)
    # Drop short all-uppercase fragments at line ends (typically bylines or credits)
    text = re.sub(r'\n\s*[A-ZÁÀẢÃẠÂĂÊÔƠƯĐ\s\.]{2,50}$', '', text.strip(), flags=re.MULTILINE)
    # Collapse runs of spaces and tabs
    text = re.sub(r'[ \t]+', ' ', text)
    return text.strip()
def split_sentences_fallback(text):
    # Naive regex sentence splitter, used when VnCoreNLP is unavailable
    sentences = re.split(r'(?<=[.!?])\s+', text)
    return [s.strip() for s in sentences if len(s.strip()) > 15]
class ExtractiveSummarizer:
    def __init__(self, stopwords_path=None):
        self.stopwords = load_stopwords(stopwords_path)
        # If py_vncorenlp is available, try to initialize it (may require a prior model download)
        self.rdrsegmenter = None
        if _VNCORENLP_AVAILABLE:
            try:
                # Use the default model dir if present in the environment; otherwise py_vncorenlp will try to download it
                self.rdrsegmenter = py_vncorenlp.VnCoreNLP()
            except Exception:
                self.rdrsegmenter = None
    def process_text_vncorenlp(self, text):
        text = clean_text_basic(text)
        paragraphs = [p.strip() for p in text.replace('\r\n', '\n').replace('\r', '\n').split('\n') if p.strip()]
        processed_sentences = []
        if self.rdrsegmenter:
            try:
                for p in paragraphs:
                    sentences = self.rdrsegmenter.word_segment(p)
                    for s in sentences:
                        # VnCoreNLP joins word tokens with "_"; undo that for the raw text
                        raw_sent = s.replace("_", " ")
                        if len(raw_sent) > 15 and any(c.isalpha() for c in raw_sent):
                            processed_sentences.append({'raw': raw_sent, 'segmented': s})
            except Exception:
                processed_sentences = []
        if not processed_sentences:
            # Fall back to regex sentence splitting when VnCoreNLP is missing or failed
            raw_sents = split_sentences_fallback(text)
            processed_sentences = [{'raw': s, 'segmented': s} for s in raw_sents]
        return processed_sentences
    def summarize(self, text, top_n=3):
        if not text or not text.strip():
            return ""
        sent_data = self.process_text_vncorenlp(text)
        if not sent_data:
            return ""
        if len(sent_data) <= top_n:
            # Text is too short to rank; return the cleaned sentences as-is
            return " ".join([s['raw'] for s in sent_data])
        corpus = [item['segmented'] for item in sent_data]
        try:
            # TF-IDF vectors over the segmented sentences
            vectorizer = TfidfVectorizer(stop_words=self.stopwords)
            tfidf_matrix = vectorizer.fit_transform(corpus)
        except Exception:
            return sent_data[0]['raw'] if sent_data else text
        # Build a sentence-similarity graph and rank sentences with PageRank (TextRank)
        similarity_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
        nx_graph = nx.from_numpy_array(similarity_matrix)
        try:
            scores = nx.pagerank(nx_graph, max_iter=500)
        except Exception:
            # If PageRank fails to converge, fall back to uniform scores
            scores = {i: 1 for i in range(len(sent_data))}
        ranked_sentences = sorted(((scores[i], sent_data[i]['raw'], i)
                                   for i in range(len(sent_data))), reverse=True)
        # Keep the top_n sentences, then restore their original order in the text
        selected_sentences = ranked_sentences[:top_n]
        selected_sentences.sort(key=lambda x: x[2])
        summary = " ".join([s for _, s, _ in selected_sentences])
        return summary
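
# Minimal usage sketch (illustrative only; the sample text below is placeholder
# data and not part of the deployed app). Without VnCoreNLP or a stopwords file,
# the regex fallback splitter and stop_words=None are used.
if __name__ == "__main__":
    sample_text = (
        "The first sentence introduces the main topic of the article. "
        "The second sentence adds an important supporting detail. "
        "The third sentence is only loosely related background. "
        "The fourth sentence repeats the main topic with concrete figures."
    )
    summarizer = ExtractiveSummarizer()
    print(summarizer.summarize(sample_text, top_n=2))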