jerryzh168 committed
Commit 067a1f2 · verified · 1 Parent(s): 127b885

Upload Gemma3ForConditionalGeneration

config.json CHANGED
@@ -3,6 +3,7 @@
     "Gemma3ForConditionalGeneration"
   ],
   "boi_token_index": 255999,
+  "dtype": "bfloat16",
   "eoi_token_index": 256000,
   "eos_token_id": [
     1,
@@ -13,12 +14,15 @@
   "mm_tokens_per_image": 256,
   "model_type": "gemma3",
   "quantization_config": {
+    "include_input_output_embeddings": false,
     "modules_to_not_convert": null,
     "quant_method": "torchao",
     "quant_type": {
       "default": {
         "_data": {
           "group_size": 128,
+          "int4_choose_qparams_algorithm": "hqq",
+          "int4_packing_format": "tile_packed_to_4d",
           "layout": {
             "_data": {
               "inner_k_tiles": 8
@@ -26,10 +30,6 @@
             "_type": "TensorCoreTiledLayout",
             "_version": 1
           },
-          "packing_format": {
-            "_data": "PLAIN",
-            "_type": "PackingFormat"
-          },
           "preserve_zero": null,
           "set_inductor_config": true,
           "use_hqq": false,
@@ -42,19 +42,85 @@
         "_version": 2
       }
     },
-    "quant_type_kwargs": {}
+    "quant_type_kwargs": {},
+    "untie_embedding_weights": false
   },
   "text_config": {
+    "_sliding_window_pattern": 6,
     "attention_bias": false,
     "attention_dropout": 0.0,
     "attn_logit_softcapping": null,
-    "cache_implementation": "hybrid",
+    "dtype": "bfloat16",
     "final_logit_softcapping": null,
     "head_dim": 128,
     "hidden_activation": "gelu_pytorch_tanh",
     "hidden_size": 5376,
     "initializer_range": 0.02,
     "intermediate_size": 21504,
+    "layer_types": [
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "sliding_attention",
+      "full_attention",
+      "sliding_attention",
+      "sliding_attention"
+    ],
     "max_position_embeddings": 131072,
     "model_type": "gemma3_text",
     "num_attention_heads": 32,
@@ -69,15 +135,13 @@
     },
     "rope_theta": 1000000.0,
     "sliding_window": 1024,
-    "sliding_window_pattern": 6,
-    "torch_dtype": "bfloat16",
     "use_cache": true,
     "vocab_size": 262208
   },
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.52.0.dev0",
+  "transformers_version": "4.56.1",
   "vision_config": {
     "attention_dropout": 0.0,
+    "dtype": "bfloat16",
     "hidden_act": "gelu_pytorch_tanh",
     "hidden_size": 1152,
     "image_size": 896,
@@ -88,7 +152,6 @@
     "num_channels": 3,
     "num_hidden_layers": 27,
     "patch_size": 14,
-    "torch_dtype": "bfloat16",
     "vision_use_head": false
   }
 }
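For context, the `quantization_config` above is the serialized form of a torchao `Int4WeightOnlyConfig` (version 2): int4 weight-only quantization with `group_size` 128 and a `TensorCoreTiledLayout`, now also recording the HQQ qparams algorithm and the `tile_packed_to_4d` packing format in place of the removed `packing_format: PLAIN` block. A minimal sketch of how such a checkpoint is typically produced via transformers' `TorchAoConfig`; the kwarg names mirror the `_data` keys in this diff, and exact names/defaults depend on the installed torchao release:

```python
# Sketch only: re-creating the serialized quantization_config shown above.
# Kwarg names mirror the "_data" keys in this diff; their availability
# depends on the torchao version installed (assumption: a recent release).
import torch
from transformers import Gemma3ForConditionalGeneration, TorchAoConfig
from torchao.quantization import Int4WeightOnlyConfig

quant_config = TorchAoConfig(
    quant_type=Int4WeightOnlyConfig(
        group_size=128,                           # "group_size": 128
        int4_packing_format="tile_packed_to_4d",  # key added in this commit
        int4_choose_qparams_algorithm="hqq",      # key added in this commit
    )
)

model = Gemma3ForConditionalGeneration.from_pretrained(
    "google/gemma-3-27b-it",            # assumed base checkpoint id
    torch_dtype=torch.bfloat16,
    device_map="auto",
    quantization_config=quant_config,
)
model.push_to_hub("<your-namespace>/gemma-3-27b-it-int4")  # placeholder repo id
```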
 
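The new explicit `layer_types` list replaces the removed `sliding_window_pattern`/`cache_implementation` pair and spells out the same rule: every sixth layer uses full attention, the rest use sliding-window attention over 1024 tokens. A quick check that regenerates the 62-entry list from that rule (62 decoder layers, per the weight map below):

```python
# Regenerate the layer_types list added in this commit from the old
# "sliding_window_pattern": 6 rule: every 6th layer is full attention.
pattern, num_layers = 6, 62
layer_types = [
    "full_attention" if (i + 1) % pattern == 0 else "sliding_attention"
    for i in range(num_layers)
]
assert layer_types.count("full_attention") == 10
assert layer_types[5] == layer_types[11] == "full_attention"
```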
generation_config.json CHANGED
@@ -9,5 +9,5 @@
   "pad_token_id": 0,
   "top_k": 64,
   "top_p": 0.95,
-  "transformers_version": "4.52.0.dev0"
+  "transformers_version": "4.56.1"
 }
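Beyond the version-string bump, nothing changes here: `top_k=64` and `top_p=0.95` remain the sampling defaults, and `generate()` reads them from this file automatically. A sketch (the repo id is a placeholder):

```python
from transformers import AutoProcessor, Gemma3ForConditionalGeneration

repo = "<this-repo>"  # placeholder for the checkpoint this commit updates
processor = AutoProcessor.from_pretrained(repo)
model = Gemma3ForConditionalGeneration.from_pretrained(repo, device_map="auto")

inputs = processor(text="Why is the sky blue?", return_tensors="pt").to(model.device)
# With do_sample=True, generate() picks up top_k=64 / top_p=0.95 from
# generation_config.json; no need to pass them explicitly.
out = model.generate(**inputs, do_sample=True, max_new_tokens=64)
print(processor.decode(out[0], skip_special_tokens=True))
```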
pytorch_model-00001-of-00004.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:408653044aa52ff49fa0f9814868d3659abd40279dfb0abf52153d8e71969e5d
-size 4952199570
+oid sha256:6c971b60644fa3fb6e9988643dbd748f5217cd93d0d7065faa711a4874707b01
+size 4993517312
pytorch_model-00002-of-00004.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:8965dabce3fdd50df83d9acfd8122ff3e6e8b0969493a3b5714f192dc3dce492
-size 4984651999
+oid sha256:2f274b6812d69241cef73232c4b1644668bb660d33ee278ca7e0ae9a74e1a91b
+size 4975728231
pytorch_model-00003-of-00004.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:1d336f78c447f68412a3cff5d23fae9efe4267c36119b5c4f8053cc0bf2b2ae7
-size 4984652195
+oid sha256:01961d4ecfcca8e5d8103145e548fa8157f146dede2db5ea2fba44788ed80165
+size 4975728231
pytorch_model-00004-of-00004.bin CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:85723169e9874620cc307c6581926afd51f0730c1c2ae25900313857ded569ad
-size 1939459663
+oid sha256:08691fb7c3930fbb501981d2c0f990589c3852da34c032a2f0a8a77053f20a80
+size 3325514970
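Each `*.bin` entry above is a Git LFS pointer, not the weights themselves: three lines recording the spec version, the blob's sha256 oid, and its size in bytes. A small sketch that verifies a downloaded shard against its pointer (file paths are assumptions):

```python
import hashlib

def verify_lfs_pointer(pointer_path: str, blob_path: str) -> bool:
    """Check a downloaded shard against its Git LFS pointer file."""
    fields = {}
    with open(pointer_path) as f:
        for line in f:  # e.g. "oid sha256:6c97..." / "size 4993517312"
            key, _, value = line.strip().partition(" ")
            fields[key] = value

    expected_oid = fields["oid"].split(":", 1)[1]
    expected_size = int(fields["size"])

    h = hashlib.sha256()
    size = 0
    with open(blob_path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
            size += len(chunk)

    return h.hexdigest() == expected_oid and size == expected_size
```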
pytorch_model.bin.index.json CHANGED
@@ -1,6 +1,7 @@
 {
   "metadata": {
-    "total_size": 16860083872
+    "total_parameters": 27432406640,
+    "total_size": 18269776096
   },
   "weight_map": {
     "language_model.lm_head.weight": "pytorch_model-00001-of-00004.bin",
@@ -265,45 +266,45 @@
     "language_model.model.layers.26.self_attn.q_norm.weight": "pytorch_model-00002-of-00004.bin",
     "language_model.model.layers.26.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
     "language_model.model.layers.26.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.27.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.27.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "language_model.model.layers.27.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.27.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
     "language_model.model.layers.27.mlp.gate_proj.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.27.mlp.up_proj.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.27.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.27.post_feedforward_layernorm.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.27.pre_feedforward_layernorm.weight": "pytorch_model-00002-of-00004.bin",
+    "language_model.model.layers.27.mlp.up_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.27.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.27.post_feedforward_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.27.pre_feedforward_layernorm.weight": "pytorch_model-00003-of-00004.bin",
     "language_model.model.layers.27.self_attn.k_norm.weight": "pytorch_model-00002-of-00004.bin",
     "language_model.model.layers.27.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
     "language_model.model.layers.27.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
     "language_model.model.layers.27.self_attn.q_norm.weight": "pytorch_model-00002-of-00004.bin",
     "language_model.model.layers.27.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
     "language_model.model.layers.27.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.28.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.28.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.28.mlp.gate_proj.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.28.mlp.up_proj.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.28.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.28.post_feedforward_layernorm.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.28.pre_feedforward_layernorm.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.28.self_attn.k_norm.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.28.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.28.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.28.self_attn.q_norm.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.28.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.28.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.29.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.29.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.29.mlp.gate_proj.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.29.mlp.up_proj.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.29.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.29.post_feedforward_layernorm.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.29.pre_feedforward_layernorm.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.29.self_attn.k_norm.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.29.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.29.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.29.self_attn.q_norm.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.29.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.29.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "language_model.model.layers.28.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.28.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.28.mlp.gate_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.28.mlp.up_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.28.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.28.post_feedforward_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.28.pre_feedforward_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.28.self_attn.k_norm.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.28.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.28.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.28.self_attn.q_norm.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.28.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.28.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.29.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.29.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.29.mlp.gate_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.29.mlp.up_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.29.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.29.post_feedforward_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.29.pre_feedforward_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.29.self_attn.k_norm.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.29.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.29.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.29.self_attn.q_norm.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.29.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.29.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
     "language_model.model.layers.3.input_layernorm.weight": "pytorch_model-00001-of-00004.bin",
     "language_model.model.layers.3.mlp.down_proj.weight": "pytorch_model-00001-of-00004.bin",
     "language_model.model.layers.3.mlp.gate_proj.weight": "pytorch_model-00001-of-00004.bin",
@@ -319,17 +320,17 @@
     "language_model.model.layers.3.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
     "language_model.model.layers.30.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
     "language_model.model.layers.30.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.30.mlp.gate_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "language_model.model.layers.30.mlp.gate_proj.weight": "pytorch_model-00003-of-00004.bin",
     "language_model.model.layers.30.mlp.up_proj.weight": "pytorch_model-00003-of-00004.bin",
     "language_model.model.layers.30.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
     "language_model.model.layers.30.post_feedforward_layernorm.weight": "pytorch_model-00003-of-00004.bin",
     "language_model.model.layers.30.pre_feedforward_layernorm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.30.self_attn.k_norm.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.30.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.30.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.30.self_attn.q_norm.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.30.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.30.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "language_model.model.layers.30.self_attn.k_norm.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.30.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.30.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.30.self_attn.q_norm.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.30.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.30.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
     "language_model.model.layers.31.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
     "language_model.model.layers.31.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
     "language_model.model.layers.31.mlp.gate_proj.weight": "pytorch_model-00003-of-00004.bin",
@@ -564,32 +565,32 @@
     "language_model.model.layers.47.self_attn.q_norm.weight": "pytorch_model-00003-of-00004.bin",
     "language_model.model.layers.47.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
     "language_model.model.layers.47.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.48.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.48.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.48.mlp.gate_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.48.mlp.up_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.48.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.48.post_feedforward_layernorm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.48.pre_feedforward_layernorm.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.48.input_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.48.mlp.down_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.48.mlp.gate_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.48.mlp.up_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.48.post_attention_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.48.post_feedforward_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.48.pre_feedforward_layernorm.weight": "pytorch_model-00004-of-00004.bin",
     "language_model.model.layers.48.self_attn.k_norm.weight": "pytorch_model-00003-of-00004.bin",
     "language_model.model.layers.48.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
     "language_model.model.layers.48.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
     "language_model.model.layers.48.self_attn.q_norm.weight": "pytorch_model-00003-of-00004.bin",
     "language_model.model.layers.48.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
     "language_model.model.layers.48.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.49.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.49.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.49.mlp.gate_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.49.mlp.up_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.49.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.49.post_feedforward_layernorm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.49.pre_feedforward_layernorm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.49.self_attn.k_norm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.49.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.49.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.49.self_attn.q_norm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.49.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.49.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.49.input_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.49.mlp.down_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.49.mlp.gate_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.49.mlp.up_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.49.post_attention_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.49.post_feedforward_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.49.pre_feedforward_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.49.self_attn.k_norm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.49.self_attn.k_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.49.self_attn.o_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.49.self_attn.q_norm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.49.self_attn.q_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.49.self_attn.v_proj.weight": "pytorch_model-00004-of-00004.bin",
     "language_model.model.layers.5.input_layernorm.weight": "pytorch_model-00001-of-00004.bin",
     "language_model.model.layers.5.mlp.down_proj.weight": "pytorch_model-00001-of-00004.bin",
     "language_model.model.layers.5.mlp.gate_proj.weight": "pytorch_model-00001-of-00004.bin",
@@ -603,45 +604,45 @@
     "language_model.model.layers.5.self_attn.q_norm.weight": "pytorch_model-00001-of-00004.bin",
     "language_model.model.layers.5.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
     "language_model.model.layers.5.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
-    "language_model.model.layers.50.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.50.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.50.mlp.gate_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.50.mlp.up_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.50.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.50.post_feedforward_layernorm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.50.pre_feedforward_layernorm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.50.self_attn.k_norm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.50.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.50.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.50.self_attn.q_norm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.50.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.50.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.51.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.51.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.51.mlp.gate_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.51.mlp.up_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.51.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.51.post_feedforward_layernorm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.51.pre_feedforward_layernorm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.51.self_attn.k_norm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.51.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.51.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.51.self_attn.q_norm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.51.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.51.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.52.input_layernorm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.52.mlp.down_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.52.mlp.gate_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.52.mlp.up_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.52.post_attention_layernorm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.52.post_feedforward_layernorm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.52.pre_feedforward_layernorm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.52.self_attn.k_norm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.52.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.52.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.52.self_attn.q_norm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.52.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.52.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.50.input_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.50.mlp.down_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.50.mlp.gate_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.50.mlp.up_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.50.post_attention_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.50.post_feedforward_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.50.pre_feedforward_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.50.self_attn.k_norm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.50.self_attn.k_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.50.self_attn.o_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.50.self_attn.q_norm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.50.self_attn.q_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.50.self_attn.v_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.51.input_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.51.mlp.down_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.51.mlp.gate_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.51.mlp.up_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.51.post_attention_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.51.post_feedforward_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.51.pre_feedforward_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.51.self_attn.k_norm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.51.self_attn.k_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.51.self_attn.o_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.51.self_attn.q_norm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.51.self_attn.q_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.51.self_attn.v_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.52.input_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.52.mlp.down_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.52.mlp.gate_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.52.mlp.up_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.52.post_attention_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.52.post_feedforward_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.52.pre_feedforward_layernorm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.52.self_attn.k_norm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.52.self_attn.k_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.52.self_attn.o_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.52.self_attn.q_norm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.52.self_attn.q_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.52.self_attn.v_proj.weight": "pytorch_model-00004-of-00004.bin",
     "language_model.model.layers.53.input_layernorm.weight": "pytorch_model-00004-of-00004.bin",
     "language_model.model.layers.53.mlp.down_proj.weight": "pytorch_model-00004-of-00004.bin",
     "language_model.model.layers.53.mlp.gate_proj.weight": "pytorch_model-00004-of-00004.bin",
@@ -649,12 +650,12 @@
     "language_model.model.layers.53.post_attention_layernorm.weight": "pytorch_model-00004-of-00004.bin",
     "language_model.model.layers.53.post_feedforward_layernorm.weight": "pytorch_model-00004-of-00004.bin",
     "language_model.model.layers.53.pre_feedforward_layernorm.weight": "pytorch_model-00004-of-00004.bin",
-    "language_model.model.layers.53.self_attn.k_norm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.53.self_attn.k_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.53.self_attn.o_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.53.self_attn.q_norm.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.53.self_attn.q_proj.weight": "pytorch_model-00003-of-00004.bin",
-    "language_model.model.layers.53.self_attn.v_proj.weight": "pytorch_model-00003-of-00004.bin",
+    "language_model.model.layers.53.self_attn.k_norm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.53.self_attn.k_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.53.self_attn.o_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.53.self_attn.q_norm.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.53.self_attn.q_proj.weight": "pytorch_model-00004-of-00004.bin",
+    "language_model.model.layers.53.self_attn.v_proj.weight": "pytorch_model-00004-of-00004.bin",
     "language_model.model.layers.54.input_layernorm.weight": "pytorch_model-00004-of-00004.bin",
     "language_model.model.layers.54.mlp.down_proj.weight": "pytorch_model-00004-of-00004.bin",
     "language_model.model.layers.54.mlp.gate_proj.weight": "pytorch_model-00004-of-00004.bin",
@@ -733,13 +734,13 @@
     "language_model.model.layers.59.self_attn.q_norm.weight": "pytorch_model-00004-of-00004.bin",
     "language_model.model.layers.59.self_attn.q_proj.weight": "pytorch_model-00004-of-00004.bin",
     "language_model.model.layers.59.self_attn.v_proj.weight": "pytorch_model-00004-of-00004.bin",
-    "language_model.model.layers.6.input_layernorm.weight": "pytorch_model-00001-of-00004.bin",
-    "language_model.model.layers.6.mlp.down_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "language_model.model.layers.6.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
+    "language_model.model.layers.6.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
     "language_model.model.layers.6.mlp.gate_proj.weight": "pytorch_model-00001-of-00004.bin",
     "language_model.model.layers.6.mlp.up_proj.weight": "pytorch_model-00001-of-00004.bin",
-    "language_model.model.layers.6.post_attention_layernorm.weight": "pytorch_model-00001-of-00004.bin",
-    "language_model.model.layers.6.post_feedforward_layernorm.weight": "pytorch_model-00001-of-00004.bin",
-    "language_model.model.layers.6.pre_feedforward_layernorm.weight": "pytorch_model-00001-of-00004.bin",
+    "language_model.model.layers.6.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
+    "language_model.model.layers.6.post_feedforward_layernorm.weight": "pytorch_model-00002-of-00004.bin",
+    "language_model.model.layers.6.pre_feedforward_layernorm.weight": "pytorch_model-00002-of-00004.bin",
     "language_model.model.layers.6.self_attn.k_norm.weight": "pytorch_model-00001-of-00004.bin",
     "language_model.model.layers.6.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
     "language_model.model.layers.6.self_attn.o_proj.weight": "pytorch_model-00001-of-00004.bin",
@@ -774,17 +775,17 @@
     "language_model.model.layers.61.self_attn.v_proj.weight": "pytorch_model-00004-of-00004.bin",
     "language_model.model.layers.7.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
     "language_model.model.layers.7.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.7.mlp.gate_proj.weight": "pytorch_model-00001-of-00004.bin",
-    "language_model.model.layers.7.mlp.up_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "language_model.model.layers.7.mlp.gate_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "language_model.model.layers.7.mlp.up_proj.weight": "pytorch_model-00002-of-00004.bin",
     "language_model.model.layers.7.post_attention_layernorm.weight": "pytorch_model-00002-of-00004.bin",
     "language_model.model.layers.7.post_feedforward_layernorm.weight": "pytorch_model-00002-of-00004.bin",
     "language_model.model.layers.7.pre_feedforward_layernorm.weight": "pytorch_model-00002-of-00004.bin",
-    "language_model.model.layers.7.self_attn.k_norm.weight": "pytorch_model-00001-of-00004.bin",
-    "language_model.model.layers.7.self_attn.k_proj.weight": "pytorch_model-00001-of-00004.bin",
-    "language_model.model.layers.7.self_attn.o_proj.weight": "pytorch_model-00001-of-00004.bin",
-    "language_model.model.layers.7.self_attn.q_norm.weight": "pytorch_model-00001-of-00004.bin",
-    "language_model.model.layers.7.self_attn.q_proj.weight": "pytorch_model-00001-of-00004.bin",
-    "language_model.model.layers.7.self_attn.v_proj.weight": "pytorch_model-00001-of-00004.bin",
+    "language_model.model.layers.7.self_attn.k_norm.weight": "pytorch_model-00002-of-00004.bin",
+    "language_model.model.layers.7.self_attn.k_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "language_model.model.layers.7.self_attn.o_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "language_model.model.layers.7.self_attn.q_norm.weight": "pytorch_model-00002-of-00004.bin",
+    "language_model.model.layers.7.self_attn.q_proj.weight": "pytorch_model-00002-of-00004.bin",
+    "language_model.model.layers.7.self_attn.v_proj.weight": "pytorch_model-00002-of-00004.bin",
     "language_model.model.layers.8.input_layernorm.weight": "pytorch_model-00002-of-00004.bin",
     "language_model.model.layers.8.mlp.down_proj.weight": "pytorch_model-00002-of-00004.bin",
     "language_model.model.layers.8.mlp.gate_proj.weight": "pytorch_model-00002-of-00004.bin",
 