# fine-tuning the Llama-2-7b chat version on PileNER with the topNE NE types and N samples per NE type (N positive + N negative)
base_model: meta-llama/Llama-2-7b-chat-hf
prompt_template_name: reverse_INST
# using a dataset converted from MSEQA format to GenQA format with (instruction, input, output) columns
data_path: None
val_data_path: None
select_train_portion: -1
val_set_size: -1 # if -1 use all validation data
output_dir: None
early_stopping_patience: 5
#training hyperparams
batch_size: 32
micro_batch_size: 1
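# with batch_size 32 and micro_batch_size 1, the training script presumably accumulates
# gradients over 32 micro-batches per optimizer step (assumption: gradient_accumulation_steps
# is derived as batch_size / micro_batch_size)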
num_epochs: 10
learning_rate: 3.0e-4
cutoff_len: 768
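# cutoff_len is assumed to be the maximum tokenized length of prompt + response;
# longer examples would be truncated to 768 tokens by the training script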
warmup_steps: 60
eval_steps: 20
logging_steps: 5
max_grad_norm: 1.0
#lora hyperparams
use_lora: True
lora_alpha: 16
lora_dropout: 0.05
lora_r: 8
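# with lora_alpha 16 and lora_r 8 the LoRA updates are scaled by alpha/r = 2
# (standard PEFT LoRA scaling; noted here as a reminder, not enforced by this file)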
lora_target_modules:
  - q_proj
  - v_proj
  - k_proj
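# these are the attention query/value/key projection matrices of the Llama-2 blocks;
# all other weights stay frozen when training with LoRA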
#llm hyperparams
# NTP (next-token prediction) loss computed only on the Response
train_on_inputs: False
group_by_length: True
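# group_by_length mirrors the Hugging Face Trainer option of the same name, batching
# samples of similar length to reduce padding (assuming the script forwards it to
# TrainingArguments)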
#quant params
load_8bit: False
load_4bit: False
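# load_8bit / load_4bit presumably toggle bitsandbytes quantized loading of the base model;
# with both False the weights are loaded unquantized, so QLoRA-style training is not used here
# (assumption about the model-loading code)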
#general params
save_total_limit: 2
use_flash_attention: False
shuffle: True
gradient_checkpointing: False
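# enabling gradient_checkpointing would trade extra compute for lower activation memory;
# it is left off here, which is reasonable at micro_batch_size 1 if sequences of
# cutoff_len 768 fit in GPU memory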