Update py/abstractive.py
py/abstractive.py CHANGED (+64 −70)
@@ -5,8 +5,8 @@ import re
 import pickle
 import unicodedata
 import numpy as np
-import torch
-from transformers import AutoTokenizer, AutoModel
+import tensorflow as tf  # use TensorFlow end to end
+from transformers import AutoTokenizer, TFAutoModel  # TFAutoModel instead of AutoModel

 # -------------------------------------------------------------------
 # CONFIG
@@ -21,7 +21,7 @@ CONFIG = {
     "FF_DIM": 2048,
     "DROPOUT": 0.2,
     "TOKENIZER_FILE": "decoder_tokenizer_re.pkl",
-    "WEIGHTS_FILE": "
+    "WEIGHTS_FILE": "best_model.weights.h5"  # renamed to match the training script's output file
 }

 # -------------------------------------------------------------------
@@ -38,81 +38,61 @@ def clean_text_inference(text: str) -> str:
     return text

 # -------------------------------------------------------------------
-# PHOBERT ENCODER (
+# PHOBERT ENCODER (TENSORFLOW VERSION)
 # -------------------------------------------------------------------
-class
+class PhoBERTEncoderTF:
     def __init__(self):
-        print(">>> Loading PhoBERT (
-        # === FIX: add low_cpu_mem_usage=False ===
-        # The "meta tensor" error occurs because the accelerate library tries to save RAM by creating an empty model.
-        # low_cpu_mem_usage=False forces the full weights to be loaded into RAM immediately.
-        try:
-            self.model = AutoModel.from_pretrained("vinai/phobert-base", low_cpu_mem_usage=False).to(self.device)
-        except TypeError:
-            # Fallback if the installed transformers version does not support this argument
-            self.model = AutoModel.from_pretrained("vinai/phobert-base").to(self.device)
-
-        self.model.eval()
+        print(">>> Loading PhoBERT (TensorFlow)...")
+        # Load the same TF model that was used during training
+        self.model = TFAutoModel.from_pretrained("vinai/phobert-base")
         print(">>> PhoBERT loaded successfully.")

     def encode(self, input_ids, attention_mask):
-        return outputs.last_hidden_state.cpu().numpy()
-
-# -------------------------------------------------------------------
-# SAFE IMPORT TENSORFLOW
-# -------------------------------------------------------------------
-TF_AVAILABLE = True
-try:
-    import tensorflow as tf
-except Exception as e:
-    TF_AVAILABLE = False
-    _TF_ERR = e
-    tf = None
+        # Inputs are numpy arrays; TFAutoModel accepts them directly
+        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
+        # Return last_hidden_state (a numpy array or TF tensor)
+        return outputs.last_hidden_state

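The encoder change is the core of this commit: PhoBERT now runs through TFAutoModel instead of the PyTorch AutoModel, so no .to(device)/.cpu() round-trips are needed. A minimal sketch of how the new class is meant to be called (not part of the commit; 256 stands in for CONFIG["MAX_TEXT_LEN"], whose value is defined elsewhere in the file):

```python
# Standalone sketch of the new TF encoding path (illustrative only).
from transformers import AutoTokenizer, TFAutoModel

tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
model = TFAutoModel.from_pretrained("vinai/phobert-base")

inp = tokenizer(
    ["xin chào việt nam"],
    max_length=256,                  # stands in for CONFIG["MAX_TEXT_LEN"]
    truncation=True,
    padding="max_length",
    return_tensors="np",             # numpy in, TF tensor out
)
out = model(input_ids=inp["input_ids"], attention_mask=inp["attention_mask"])
print(out.last_hidden_state.shape)   # (1, 256, 768) -- 768 matches enc_raw_input's last dimension
```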
 # -------------------------------------------------------------------
 # DECODER TRANSFORMER (TensorFlow)
 # -------------------------------------------------------------------
-        return self.ln3(out2 + self.drop3(ffn_out, training=training))
+class TransformerDecoderBlock(tf.keras.layers.Layer):
+    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
+        super().__init__(**kwargs)
+        self.att1 = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
+        self.att2 = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
+        self.ffn = tf.keras.Sequential([
+            tf.keras.layers.Dense(ff_dim, activation="relu"),
+            tf.keras.layers.Dense(embed_dim),
+        ])
+        self.ln1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+        self.ln2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+        self.ln3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+        self.drop1 = tf.keras.layers.Dropout(rate)
+        self.drop2 = tf.keras.layers.Dropout(rate)
+        self.drop3 = tf.keras.layers.Dropout(rate)
+
+    def call(self, x, enc_output, training=None):
+        attn1 = self.att1(x, x, use_causal_mask=True)
+        out1 = self.ln1(x + self.drop1(attn1, training=training))
+        attn2 = self.att2(out1, enc_output)
+        out2 = self.ln2(out1 + self.drop2(attn2, training=training))
+        ffn_out = self.ffn(out2)
+        return self.ln3(out2 + self.drop3(ffn_out, training=training))

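Each TransformerDecoderBlock applies causal self-attention over the summary tokens, cross-attention into the encoder output, and a feed-forward layer, each wrapped in a residual connection and layer norm. A shape sketch (not part of the commit), assuming the class above is in scope and using illustrative dimensions in place of the CONFIG values:

```python
import tensorflow as tf

block = TransformerDecoderBlock(embed_dim=256, num_heads=8, ff_dim=2048, rate=0.2)

dec_states = tf.random.normal((1, 30, 256))   # (batch, summary_len, EMBED_DIM)
enc_states = tf.random.normal((1, 256, 256))  # (batch, text_len, EMBED_DIM) after encoder_projection
out = block(dec_states, enc_states, training=False)
print(out.shape)  # (1, 30, 256) -- same shape as the decoder input
```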
 # -------------------------------------------------------------------
-# BUILD
+# BUILD INFERENCE MODEL
 # -------------------------------------------------------------------
 def build_inference_model():
+    # 1. Inputs
     enc_raw_input = tf.keras.Input(shape=(None, 768), name='enc_raw_input')
     dec_inputs_inf = tf.keras.Input(shape=(None,), dtype=tf.int32, name='dec_inputs_inf')

+    # 2. Projection layer (the layer name must match the training script: encoder_projection)
     enc_out = tf.keras.layers.Dense(CONFIG["EMBED_DIM"], activation="linear", name="encoder_projection")(enc_raw_input)
     enc_out = tf.keras.layers.Dropout(CONFIG["DROPOUT"], name="encoder_dropout")(enc_out)

+    # 3. Embeddings
     dec_token_emb = tf.keras.layers.Embedding(CONFIG["MAX_VOCAB_OUT"], CONFIG["EMBED_DIM"], mask_zero=True, name='dec_token_emb')
     dec_pos_emb = tf.keras.layers.Embedding(CONFIG["MAX_SUMMARY_LEN"], CONFIG["EMBED_DIM"], name='dec_pos_emb')

@@ -126,6 +106,7 @@ def build_inference_model():

     dec_emb_inf = tf.keras.layers.Lambda(add_pos_emb_inf, name='dec_emb_plus_pos_inf')(dec_inputs_inf)

+    # 4. Decoder blocks
     dec_out = dec_emb_inf
     for i in range(CONFIG["NUM_LAYERS"]):
         block = TransformerDecoderBlock(
@@ -137,6 +118,7 @@ def build_inference_model():
         )
         dec_out = block(dec_out, enc_out, training=False)

+    # 5. Output
     outputs_inf = tf.keras.layers.Dense(CONFIG["MAX_VOCAB_OUT"], activation='softmax', name='output_dense')(dec_out)

     model = tf.keras.Model(inputs=[enc_raw_input, dec_inputs_inf], outputs=outputs_inf, name="inference_decoder_export")
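build_inference_model returns an ordinary Keras functional model whose first input is the pre-computed PhoBERT hidden states, so the layer names above (encoder_projection, dec_token_emb, output_dense, ...) are what the saved weights are matched against. A quick inspection sketch (not part of the commit):

```python
import numpy as np

model = build_inference_model()
model.summary()

# Dummy forward pass: 256 is an illustrative text length, 10 a partially generated summary.
enc = np.zeros((1, 256, 768), dtype="float32")
dec = np.ones((1, 10), dtype="int32")
probs = model.predict([enc, dec], verbose=0)
print(probs.shape)  # (1, 10, MAX_VOCAB_OUT) -- one vocabulary distribution per decoder position
```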
@@ -147,12 +129,10 @@ def build_inference_model():
 # -------------------------------------------------------------------
 class AbstractiveSummarizer:
     def __init__(self, model_dir="./models"):
-        if not TF_AVAILABLE:
-            raise RuntimeError(f"TensorFlow is not available: {_TF_ERR}")
-
         self.model_dir = model_dir

+        # Load PhoBERT (TensorFlow)
+        self.phobert_encoder = PhoBERTEncoderTF()
         self.phobert_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

         self._load_resources()
@@ -174,8 +154,12 @@ class AbstractiveSummarizer:

         print(f"📥 Loading weights from {weights_path}...")
         try:
+            # Load weights.
+            # Important: if training saved "best_model.weights.h5", loading it into this inference
+            # structure skips the PhoBERT layers (the inference model starts from enc_raw_input)
+            # and only loads the weights of the projection + decoder.
+            self.decoder_model.load_weights(weights_path, skip_mismatch=True)
+            print("✅ Weights loaded successfully (with skip_mismatch=True for safety)!")
         except Exception as e:
             print(f"❌ Error loading weights: {e}")

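Because the inference graph starts at enc_raw_input, the weights file only has to supply the projection and decoder parameters; anything else is skipped. A sanity-check sketch (not part of the commit) for confirming how much of the file actually landed in the model; decoder_model and the path are assumed to already exist:

```python
import numpy as np

# Snapshot the parameters, load, then count how many tensors the file changed.
before = [w.copy() for w in decoder_model.get_weights()]
decoder_model.load_weights("./models/best_model.weights.h5", skip_mismatch=True)
after = decoder_model.get_weights()

changed = sum(not np.array_equal(b, a) for b, a in zip(before, after))
print(f"{changed}/{len(after)} weight tensors were updated from the file")
```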
@@ -196,6 +180,7 @@ class AbstractiveSummarizer:
                 continue

             dec_inp = np.array([seq])
+            # Predict the next-token distribution for this candidate sequence
             preds = self.decoder_model.predict([enc_out, dec_inp], verbose=0)
             last_token_probs = preds[0, -1, :]

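The loop above scores one candidate sequence at a time; the autoregressive idea is easier to see in a greedy variant. A sketch (not the beam_search this file actually uses), assuming decoder_model, enc_out, and the fitted Keras tokenizer are available and that training used startseq/endseq markers:

```python
import numpy as np

def greedy_decode(decoder_model, enc_out, tokenizer, max_len=30):
    start_id = tokenizer.word_index['startseq']
    end_id = tokenizer.word_index['endseq']
    seq = [start_id]
    for _ in range(max_len):
        # Feed the tokens generated so far; take the argmax of the last position.
        preds = decoder_model.predict([enc_out, np.array([seq])], verbose=0)
        next_id = int(np.argmax(preds[0, -1, :]))
        if next_id == end_id:
            break
        seq.append(next_id)
    return seq
```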
@@ -216,17 +201,26 @@ class AbstractiveSummarizer:
         return summary

     def summarize_debug(self, text, k=3):
+        # 1. Clean
         text_clean = clean_text_inference(text)
+
+        # 2. Tokenize the input
         inp = self.phobert_tokenizer(
             [text_clean],
             max_length=CONFIG["MAX_TEXT_LEN"],
             truncation=True, padding='max_length',
-            return_tensors='np'
+            return_tensors='np'  # important: return numpy arrays for TF
         )
-        enc_out = self.phobert.encode(inp['input_ids'], inp['attention_mask'])
-        seq = self.beam_search(enc_out, k=k)

+        # 3. Encode (through the TF PhoBERT encoder)
+        # inp['input_ids'] is a numpy array
+        enc_out = self.phobert_encoder.encode(inp['input_ids'], inp['attention_mask'])
+
+        # 4. Beam search
+        seq_ids = self.beam_search(enc_out, k=k)
+
+        # 5. Decode
+        decoded_text = self.tokenizer.sequences_to_texts([seq_ids])[0]
         summary = decoded_text.replace('startseq', '').replace('endseq', '').strip()

-        return summary,
+        return summary, seq_ids
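Putting it together, summarize_debug now returns both the cleaned summary and the raw decoder ids. A usage sketch (not part of the commit), assuming py/ is on the import path and ./models contains decoder_tokenizer_re.pkl and best_model.weights.h5:

```python
from abstractive import AbstractiveSummarizer

summarizer = AbstractiveSummarizer(model_dir="./models")
summary, seq_ids = summarizer.summarize_debug(
    "Một đoạn văn bản tiếng Việt cần tóm tắt ...",
    k=3,
)
print(summary)
print(seq_ids)  # raw decoder token ids, handy when debugging the tokenizer
```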