# NLP-HOMEWORK / main.py
# yujieyyj's picture
# Initial commit
# e3af320 verified
import os
import numpy as np
import json
import argparse
from utils_data import load_data, MyDataset
from modeling_bert import BertForSequenceClassification
from transformers import (
AutoConfig,
BertForSequenceClassification,
AutoTokenizer,
EvalPrediction,
Trainer,
TrainingArguments,
default_data_collator,
set_seed,
)
def parse_args(argv=None):
    """Parse the command-line options of the training script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which is
            the behaviour callers that pass no argument already rely on.

    Returns:
        An ``argparse.Namespace`` with the attributes ``data_root``,
        ``model``, ``output_dir``, ``epoch``, ``lr``, ``bs``, ``max_length``
        and ``seed``.
    """
    parser = argparse.ArgumentParser()
    # Locations: where the data lives and where results are written.
    parser.add_argument('--data_root', type=str, default='data')
    # Name or path handed to the ``from_pretrained`` loaders in ``main``.
    parser.add_argument('--model', type=str, default='bert-base-uncased')
    parser.add_argument('--output_dir', type=str, default='experiments')
    # Optimisation hyper-parameters forwarded to ``TrainingArguments``.
    parser.add_argument('--epoch', type=int, default=2)
    parser.add_argument('--lr', type=float, default=2e-5)
    parser.add_argument('--bs', type=int, default=32)
    # Forwarded to ``MyDataset``; presumably the maximum token length per
    # example -- confirm against utils_data.
    parser.add_argument('--max_length', type=int, default=512)
    parser.add_argument('--seed', type=int, default=666, help='random seed')
    return parser.parse_args(argv)
def main():
    """Fine-tune a sequence classifier, evaluate it and write test predictions.

    The function reads its configuration from the command line via
    ``parse_args``, seeds the run, loads the pretrained config, tokenizer and
    model, builds the train/val/test datasets, and drives a ``Trainer``
    through training, evaluation and prediction.

    Side effects:
        * Prints the parsed arguments to stdout.
        * Saves the model, trainer state and train/eval metrics through the
          ``Trainer`` (under ``args.output_dir``).
        * Writes ``predict_results.txt`` (tab-separated ``index`` and
          ``prediction`` columns) into ``args.output_dir``.
    """
    args = parse_args()
    print("args",args)
    print('====Input Arguments====')
    print(json.dumps(vars(args), indent=2, sort_keys=False))

    # Set seed before initializing model, for reproduction purpose.
    set_seed(args.seed)

    # Load pretrained model and tokenizer
    config = AutoConfig.from_pretrained(args.model)
    tokenizer = AutoTokenizer.from_pretrained(args.model)
    # NOTE(review): ``BertForSequenceClassification`` is imported at the top
    # of the file from both ``modeling_bert`` and ``transformers``; the later
    # import wins, so the local implementation is shadowed -- confirm which
    # one is intended.
    model = BertForSequenceClassification.from_pretrained(args.model, config=config)

    # Load data
    train_data = load_data(args, "train")
    train_dataset = MyDataset(train_data, tokenizer, args.max_length, is_test=False)
    eval_data = load_data(args, "val")
    eval_dataset = MyDataset(eval_data, tokenizer, args.max_length, is_test=False)
    test_data = load_data(args, "test")
    test_dataset = MyDataset(test_data, tokenizer, args.max_length, is_test=True)

    def to_class_ids(raw_predictions):
        """Return the arg-max class index per row of the model output."""
        # The model output may arrive as a tuple whose first element holds
        # the scores; unwrap it the same way for metrics and for prediction.
        if isinstance(raw_predictions, tuple):
            raw_predictions = raw_predictions[0]
        return np.argmax(raw_predictions, axis=1)

    def compute_metrics(p: EvalPrediction):
        """Compute accuracy of the arg-max predictions against the labels."""
        preds = to_class_ids(p.predictions)
        correct = ((preds == p.label_ids).sum()).item()
        return {'accuracy': 1.0*correct/len(preds)}

    training_args = TrainingArguments(
        output_dir=args.output_dir,
        do_train=True,
        do_eval=True,
        do_predict=True,
        logging_strategy="steps",
        save_strategy="epoch",
        learning_rate=args.lr,
        per_device_train_batch_size=args.bs,
        per_device_eval_batch_size=args.bs,
        num_train_epochs=args.epoch,
        report_to="none",
    )

    # Initialize our Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=eval_dataset if training_args.do_eval else None,
        compute_metrics=compute_metrics,
        tokenizer=tokenizer,
        data_collator=default_data_collator,
    )

    # Training
    if training_args.do_train:
        train_result = trainer.train()
        metrics = train_result.metrics
        trainer.save_model()  # Saves the tokenizer too for easy upload
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()

    # Evaluation
    if training_args.do_eval:
        metrics = trainer.evaluate(eval_dataset=eval_dataset)
        trainer.log_metrics("eval", metrics)
        trainer.save_metrics("eval", metrics)

    # Prediction
    if training_args.do_predict:
        raw_predictions = trainer.predict(test_dataset, metric_key_prefix="predict").predictions
        predictions = to_class_ids(raw_predictions)
        output_predict_file = os.path.join(args.output_dir, "predict_results.txt")
        # Only the main process writes, so distributed runs do not clobber
        # each other's output.
        if trainer.is_world_process_zero():
            # Make sure the target directory exists before opening the file.
            os.makedirs(args.output_dir, exist_ok=True)
            with open(output_predict_file, "w", encoding="utf-8") as writer:
                writer.write("index\tprediction\n")
                writer.writelines(
                    f"{index}\t{item}\n" for index, item in enumerate(predictions)
                )
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()