HIEHEU committed
Commit 2093180 · verified · 1 Parent(s): 8187ed6

Update py/abstractive.py

Files changed (1)
  1. py/abstractive.py +64 -70
py/abstractive.py CHANGED
@@ -5,8 +5,8 @@ import re
  import pickle
  import unicodedata
  import numpy as np
- import torch
- from transformers import AutoTokenizer, AutoModel
+ import tensorflow as tf  # use TensorFlow end to end
+ from transformers import AutoTokenizer, TFAutoModel  # TFAutoModel instead of AutoModel
 
  # -------------------------------------------------------------------
  # CONFIG
@@ -21,7 +21,7 @@ CONFIG = {
      "FF_DIM": 2048,
      "DROPOUT": 0.2,
      "TOKENIZER_FILE": "decoder_tokenizer_re.pkl",
-     "WEIGHTS_FILE": "decoder_only.weights.h5"
+     "WEIGHTS_FILE": "best_model.weights.h5"  # renamed to match your training file
  }
 
  # -------------------------------------------------------------------
@@ -38,81 +38,61 @@ def clean_text_inference(text: str) -> str:
      return text
 
  # -------------------------------------------------------------------
- # PHOBERT ENCODER (PyTorch) - META TENSOR ERROR FIXED HERE
+ # PHOBERT ENCODER (TENSORFLOW VERSION)
  # -------------------------------------------------------------------
- class PhoBERTEncoderTorch:
+ class PhoBERTEncoderTF:
      def __init__(self):
-         print(">>> Loading PhoBERT (PyTorch)...")
-         self.device = torch.device("cpu")
- 
-         # === FIX: add low_cpu_mem_usage=False ===
-         # The "meta tensor" error occurs because the accelerate library tries to save RAM by creating an empty model.
-         # low_cpu_mem_usage=False forces the full weights to be loaded into RAM immediately.
-         try:
-             self.model = AutoModel.from_pretrained("vinai/phobert-base", low_cpu_mem_usage=False).to(self.device)
-         except TypeError:
-             # Fallback for older transformers versions that do not support this argument
-             self.model = AutoModel.from_pretrained("vinai/phobert-base").to(self.device)
- 
-         self.model.eval()
+         print(">>> Loading PhoBERT (TensorFlow)...")
+         # Load the TF model exactly as it was used during training
+         self.model = TFAutoModel.from_pretrained("vinai/phobert-base")
          print(">>> PhoBERT loaded successfully.")
 
      def encode(self, input_ids, attention_mask):
-         with torch.no_grad():
-             ids = torch.tensor(input_ids).to(self.device)
-             mask = torch.tensor(attention_mask).to(self.device)
-             outputs = self.model(ids, attention_mask=mask)
-             return outputs.last_hidden_state.cpu().numpy()
- 
- # -------------------------------------------------------------------
- # SAFE IMPORT TENSORFLOW
- # -------------------------------------------------------------------
- TF_AVAILABLE = True
- try:
-     import tensorflow as tf
- except Exception as e:
-     TF_AVAILABLE = False
-     _TF_ERR = e
-     tf = None
+         # Inputs are numpy arrays; TFAutoModel accepts them directly
+         outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
+         # Return last_hidden_state as a numpy array or tensor
+         return outputs.last_hidden_state
 
  # -------------------------------------------------------------------
  # DECODER TRANSFORMER (TensorFlow)
  # -------------------------------------------------------------------
- if TF_AVAILABLE:
-     class TransformerDecoderBlock(tf.keras.layers.Layer):
-         def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
-             super().__init__(**kwargs)
-             self.att1 = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
-             self.att2 = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
-             self.ffn = tf.keras.Sequential([
-                 tf.keras.layers.Dense(ff_dim, activation="relu"),
-                 tf.keras.layers.Dense(embed_dim),
-             ])
-             self.ln1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
-             self.ln2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
-             self.ln3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
-             self.drop1 = tf.keras.layers.Dropout(rate)
-             self.drop2 = tf.keras.layers.Dropout(rate)
-             self.drop3 = tf.keras.layers.Dropout(rate)
- 
-         def call(self, x, enc_output, training=None):
-             attn1 = self.att1(x, x, use_causal_mask=True)
-             out1 = self.ln1(x + self.drop1(attn1, training=training))
-             attn2 = self.att2(out1, enc_output)
-             out2 = self.ln2(out1 + self.drop2(attn2, training=training))
-             ffn_out = self.ffn(out2)
-             return self.ln3(out2 + self.drop3(ffn_out, training=training))
+ class TransformerDecoderBlock(tf.keras.layers.Layer):
+     def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
+         super().__init__(**kwargs)
+         self.att1 = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
+         self.att2 = tf.keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
+         self.ffn = tf.keras.Sequential([
+             tf.keras.layers.Dense(ff_dim, activation="relu"),
+             tf.keras.layers.Dense(embed_dim),
+         ])
+         self.ln1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+         self.ln2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+         self.ln3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
+         self.drop1 = tf.keras.layers.Dropout(rate)
+         self.drop2 = tf.keras.layers.Dropout(rate)
+         self.drop3 = tf.keras.layers.Dropout(rate)
+ 
+     def call(self, x, enc_output, training=None):
+         attn1 = self.att1(x, x, use_causal_mask=True)
+         out1 = self.ln1(x + self.drop1(attn1, training=training))
+         attn2 = self.att2(out1, enc_output)
+         out2 = self.ln2(out1 + self.drop2(attn2, training=training))
+         ffn_out = self.ffn(out2)
+         return self.ln3(out2 + self.drop3(ffn_out, training=training))
 
  # -------------------------------------------------------------------
- # BUILD DECODER MODEL
+ # BUILD INFERENCE MODEL
  # -------------------------------------------------------------------
  def build_inference_model():
+     # 1. Inputs
      enc_raw_input = tf.keras.Input(shape=(None, 768), name='enc_raw_input')
      dec_inputs_inf = tf.keras.Input(shape=(None,), dtype=tf.int32, name='dec_inputs_inf')
 
+     # 2. Projection layer (the name must match the training file: encoder_projection)
      enc_out = tf.keras.layers.Dense(CONFIG["EMBED_DIM"], activation="linear", name="encoder_projection")(enc_raw_input)
      enc_out = tf.keras.layers.Dropout(CONFIG["DROPOUT"], name="encoder_dropout")(enc_out)
 
+     # 3. Embeddings
      dec_token_emb = tf.keras.layers.Embedding(CONFIG["MAX_VOCAB_OUT"], CONFIG["EMBED_DIM"], mask_zero=True, name='dec_token_emb')
      dec_pos_emb = tf.keras.layers.Embedding(CONFIG["MAX_SUMMARY_LEN"], CONFIG["EMBED_DIM"], name='dec_pos_emb')
 
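An aside on the encoder swap above (not part of the commit): a quick sanity check that the TF PhoBERT emits (batch, seq_len, 768) hidden states, which is exactly the shape enc_raw_input expects. This assumes vinai/phobert-base ships TF weights; if only PyTorch weights are available, TFAutoModel.from_pretrained(..., from_pt=True) converts them on load. The max length of 256 is illustrative and stands in for CONFIG["MAX_TEXT_LEN"].

    # Sanity-check sketch for the new TF encoder path
    from transformers import AutoTokenizer, TFAutoModel

    tok = AutoTokenizer.from_pretrained("vinai/phobert-base")
    enc = TFAutoModel.from_pretrained("vinai/phobert-base")  # add from_pt=True if no TF weights

    inp = tok(["xin chào"], max_length=256, truncation=True,
              padding='max_length', return_tensors='np')
    out = enc(input_ids=inp['input_ids'], attention_mask=inp['attention_mask'])
    print(out.last_hidden_state.shape)  # expected: (1, 256, 768)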
@@ -126,6 +106,7 @@ def build_inference_model():
 
      dec_emb_inf = tf.keras.layers.Lambda(add_pos_emb_inf, name='dec_emb_plus_pos_inf')(dec_inputs_inf)
 
+     # 4. Decoder blocks
      dec_out = dec_emb_inf
      for i in range(CONFIG["NUM_LAYERS"]):
          block = TransformerDecoderBlock(
@@ -137,6 +118,7 @@ def build_inference_model():
          )
          dec_out = block(dec_out, enc_out, training=False)
 
+     # 5. Output
      outputs_inf = tf.keras.layers.Dense(CONFIG["MAX_VOCAB_OUT"], activation='softmax', name='output_dense')(dec_out)
 
      model = tf.keras.Model(inputs=[enc_raw_input, dec_inputs_inf], outputs=outputs_inf, name="inference_decoder_export")
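The Lambda wired in above calls add_pos_emb_inf, whose definition sits outside the changed hunks. A typical body, assuming the model simply sums token embeddings with learned positional embeddings (a hypothetical reconstruction, not the commit's code):

    # Hypothetical body of add_pos_emb_inf: token embedding plus a learned
    # positional embedding for positions 0..T-1
    def add_pos_emb_inf(tokens):
        positions = tf.range(start=0, limit=tf.shape(tokens)[-1], delta=1)
        return dec_token_emb(tokens) + dec_pos_emb(positions)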
@@ -147,12 +129,10 @@ def build_inference_model():
  # -------------------------------------------------------------------
  class AbstractiveSummarizer:
      def __init__(self, model_dir="./models"):
-         if not TF_AVAILABLE:
-             raise RuntimeError(f"TensorFlow is not available: {_TF_ERR}")
- 
          self.model_dir = model_dir
 
-         self.phobert = PhoBERTEncoderTorch()
+         # Load PhoBERT (TensorFlow)
+         self.phobert_encoder = PhoBERTEncoderTF()
          self.phobert_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")
 
          self._load_resources()
@@ -174,8 +154,12 @@ class AbstractiveSummarizer:
 
          print(f"📥 Loading weights from {weights_path}...")
          try:
-             self.decoder_model.load_weights(weights_path)
-             print(" Weights loaded successfully!")
+             # Load weights.
+             # Important: if training saved "best_model.weights.h5", loading it into this
+             # inference structure skips the PhoBERT layers (the inference model starts at
+             # enc_raw_input) and only loads the projection + decoder weights.
+             self.decoder_model.load_weights(weights_path, skip_mismatch=True)
+             print("✅ Weights loaded successfully (with skip_mismatch=True for safety)!")
          except Exception as e:
              print(f"❌ Error loading weights: {e}")
 
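A caveat on skip_mismatch (an editor's note, not part of the commit): Keras 3 accepts load_weights(path, skip_mismatch=True) directly, while tf.keras 2.x only honors skip_mismatch together with by_name=True and rejects it otherwise. A version-tolerant load might look like:

    # Sketch: try the Keras 3 signature first, fall back to the tf.keras 2.x one
    try:
        self.decoder_model.load_weights(weights_path, skip_mismatch=True)
    except (TypeError, ValueError):
        self.decoder_model.load_weights(weights_path, by_name=True, skip_mismatch=True)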
@@ -196,6 +180,7 @@ class AbstractiveSummarizer:
                  continue
 
              dec_inp = np.array([seq])
+             # Predict
              preds = self.decoder_model.predict([enc_out, dec_inp], verbose=0)
              last_token_probs = preds[0, -1, :]
 
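For orientation, the lines above are the per-step scoring inside beam_search, whose full definition is outside the changed hunks. A minimal sketch of the expansion step it implies, assuming each beam is a (token-list, log-probability) pair:

    # Sketch of one beam expansion step (an assumption about the surrounding
    # beam_search, which this commit does not modify)
    import numpy as np

    def expand_beam(seq, log_p, last_token_probs, k):
        top_ids = np.argsort(last_token_probs)[-k:]  # k most likely next tokens
        return [(seq + [int(t)], log_p + np.log(last_token_probs[t] + 1e-12))
                for t in top_ids]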
@@ -216,17 +201,26 @@ class AbstractiveSummarizer:
          return summary
 
      def summarize_debug(self, text, k=3):
+         # 1. Clean
          text_clean = clean_text_inference(text)
+ 
+         # 2. Tokenize input
          inp = self.phobert_tokenizer(
              [text_clean],
              max_length=CONFIG["MAX_TEXT_LEN"],
              truncation=True, padding='max_length',
-             return_tensors='np'
+             return_tensors='np'  # important: return numpy for TF
          )
-         enc_out = self.phobert.encode(inp['input_ids'], inp['attention_mask'])
-         seq = self.beam_search(enc_out, k=k)
 
-         decoded_text = self.tokenizer.sequences_to_texts([seq])[0]
+         # 3. Encode (through the TF PhoBERT encoder)
+         # inp['input_ids'] is a numpy array
+         enc_out = self.phobert_encoder.encode(inp['input_ids'], inp['attention_mask'])
+ 
+         # 4. Beam search
+         seq_ids = self.beam_search(enc_out, k=k)
+ 
+         # 5. Decode
+         decoded_text = self.tokenizer.sequences_to_texts([seq_ids])[0]
          summary = decoded_text.replace('startseq', '').replace('endseq', '').strip()
 
-         return summary, seq
+         return summary, seq_ids
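Taken together, a usage sketch for the updated class (the model directory and input text are illustrative):

    # End-to-end usage sketch, not part of the commit
    summarizer = AbstractiveSummarizer(model_dir="./models")
    summary, seq_ids = summarizer.summarize_debug("Văn bản tiếng Việt cần tóm tắt ...", k=3)
    print(summary)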