This tiny model is intended for debugging. It is randomly initialized, with a configuration adapted from mistralai/Mistral-Small-4-119B-2603.
| File path | Size |
|---|---|
| model.safetensors | 11.8MB |
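As a quick sanity check that the checkpoint really is tiny, the weight file listed above can be fetched and measured directly. This is a minimal sketch, not part of the original card; it only relies on `hf_hub_download` from `huggingface_hub`:

```python
import os

from huggingface_hub import hf_hub_download

# Download (or reuse the cached copy of) the single weight shard from the table above
path = hf_hub_download("yujiepan/mistral-small-4-tiny-random", filename="model.safetensors")
print(f"{os.path.getsize(path) / 1e6:.1f} MB")  # should roughly match the 11.8MB listed
```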
Example usage:

```python
import torch
from transformers import AutoProcessor, Mistral3ForConditionalGeneration
# Load model and processor
model_id = "yujiepan/mistral-small-4-tiny-random"
model = Mistral3ForConditionalGeneration.from_pretrained(
model_id,
device_map="auto",
torch_dtype="bfloat16",
trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(model_id)
image_url = "https://static.wikia.nocookie.net/essentialsdocs/images/7/70/Battle.png/revision/latest?cb=20220523172438"
messages = [
{
"role": "user",
"content": [
{
"type": "text",
"text": "What is this?",
},
{"type": "image_url", "image_url": {"url": image_url}},
],
},
]
inputs = processor.apply_chat_template(
messages,
return_tensors="pt",
tokenize=True,
return_dict=True,
reasoning_effort="high",
)
inputs = inputs.to(model.device)
output = model.generate(
**inputs,
max_new_tokens=32,
do_sample=True,
temperature=0.7,
)[0]
decoded_output = processor.decode(output, skip_special_tokens=False).replace(
"[IMG]", "I"
)
print(decoded_output)
```
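The snippet above exercises the vision tower through the image URL. For debugging text-only code paths, a plain chat turn can be sent the same way; this is a small sketch reusing the `model` and `processor` loaded above, and it assumes the chat template also accepts a turn without an image or the extra `reasoning_effort` argument:

```python
# Text-only sanity check, reusing model/processor from the snippet above
messages = [{"role": "user", "content": [{"type": "text", "text": "Hello"}]}]
inputs = processor.apply_chat_template(
    messages, tokenize=True, return_dict=True, return_tensors="pt"
).to(model.device)
output = model.generate(**inputs, max_new_tokens=8)[0]
print(processor.decode(output, skip_special_tokens=True))
```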
Code to create this repo:

```python
import json
from pathlib import Path
import accelerate
import torch
from huggingface_hub import file_exists, hf_hub_download
from transformers import (
AutoConfig,
AutoModelForCausalLM,
AutoProcessor,
GenerationConfig,
Mistral3ForConditionalGeneration,
MistralCommonBackend,
set_seed,
)
# Source model to adapt and the local folder where the tiny copy is written
source_model_id = "mistralai/Mistral-Small-4-119B-2603"
save_folder = "/tmp/yujiepan/mistral-small-4-tiny-random"
# Copy the processor and tokenizer files from the source model
processor = AutoProcessor.from_pretrained(source_model_id, trust_remote_code=True)
processor.save_pretrained(save_folder)
processor = MistralCommonBackend.from_pretrained(
source_model_id, trust_remote_code=True
)
processor.save_pretrained(save_folder)
# Download the original config.json and load it
with open(
hf_hub_download(source_model_id, filename="config.json", repo_type="model"),
"r",
encoding="utf-8",
) as f:
config_json = json.load(f)
config_json["text_config"].update(
{
"hidden_size": 8,
"intermediate_size": 32,
"moe_intermediate_size": 32,
"num_hidden_layers": 2,
"q_lora_rank": 32,
}
)
# config_json['tie_word_embeddings'] = True
config_json["vision_config"].update(
{
"head_dim": 32,
"hidden_size": 64,
"intermediate_size": 64,
"num_attention_heads": 2,
"num_hidden_layers": 2,
}
)
# The original quantization settings do not apply to the re-initialized weights
del config_json["quantization_config"]
with open(f"{save_folder}/config.json", "w", encoding="utf-8") as f:
json.dump(config_json, f, indent=2)
config = AutoConfig.from_pretrained(
save_folder,
trust_remote_code=True,
)
print(config)
# Build the tiny model directly in bfloat16
torch.set_default_dtype(torch.bfloat16)
model = Mistral3ForConditionalGeneration(config)
torch.set_default_dtype(torch.float32)
if file_exists(
filename="generation_config.json", repo_id=source_model_id, repo_type="model"
):
model.generation_config = GenerationConfig.from_pretrained(
source_model_id,
trust_remote_code=True,
)
model.generation_config.do_sample = True
print(model.generation_config)
model = model.cpu()
# Re-initialize every parameter with small Gaussian noise
with torch.no_grad():
for name, p in sorted(model.named_parameters()):
torch.nn.init.normal_(p, 0, 0.2)
print(name, p.shape)
model.save_pretrained(save_folder)
print(model)
```
The printed architecture:

```
Mistral3ForConditionalGeneration(
(model): Mistral3Model(
(vision_tower): PixtralVisionModel(
(patch_conv): Conv2d(3, 64, kernel_size=(14, 14), stride=(14, 14), bias=False)
(ln_pre): PixtralRMSNorm((64,), eps=1e-05)
(transformer): PixtralTransformer(
(layers): ModuleList(
(0-1): 2 x PixtralAttentionLayer(
(attention_norm): PixtralRMSNorm((64,), eps=1e-05)
(feed_forward): PixtralMLP(
(gate_proj): Linear(in_features=64, out_features=64, bias=False)
(up_proj): Linear(in_features=64, out_features=64, bias=False)
(down_proj): Linear(in_features=64, out_features=64, bias=False)
(act_fn): SiLUActivation()
)
(attention): PixtralAttention(
(k_proj): Linear(in_features=64, out_features=64, bias=False)
(v_proj): Linear(in_features=64, out_features=64, bias=False)
(q_proj): Linear(in_features=64, out_features=64, bias=False)
(o_proj): Linear(in_features=64, out_features=64, bias=False)
)
(ffn_norm): PixtralRMSNorm((64,), eps=1e-05)
)
)
)
(patch_positional_embedding): PixtralRotaryEmbedding()
)
(multi_modal_projector): Mistral3MultiModalProjector(
(norm): Mistral3RMSNorm((64,), eps=1e-06)
(patch_merger): Mistral3PatchMerger(
(merging_layer): Linear(in_features=256, out_features=64, bias=False)
)
(linear_1): Linear(in_features=64, out_features=8, bias=False)
(act): GELUActivation()
(linear_2): Linear(in_features=8, out_features=8, bias=False)
)
(language_model): Mistral4Model(
(embed_tokens): Embedding(131072, 8, padding_idx=11)
(layers): ModuleList(
(0-1): 2 x Mistral4DecoderLayer(
(self_attn): Mistral4Attention(
(q_a_proj): Linear(in_features=8, out_features=32, bias=False)
(q_a_layernorm): Mistral4RMSNorm((32,), eps=1e-06)
(q_b_proj): Linear(in_features=32, out_features=4096, bias=False)
(kv_a_proj_with_mqa): Linear(in_features=8, out_features=320, bias=False)
(kv_a_layernorm): Mistral4RMSNorm((256,), eps=1e-06)
(kv_b_proj): Linear(in_features=256, out_features=6144, bias=False)
(o_proj): Linear(in_features=4096, out_features=8, bias=False)
)
(mlp): Mistral4MoE(
(experts): Mistral4NaiveMoe(
(act_fn): SiLUActivation()
)
(gate): Mistral4TopkRouter()
(shared_experts): Mistral4MLP(
(gate_proj): Linear(in_features=8, out_features=32, bias=False)
(up_proj): Linear(in_features=8, out_features=32, bias=False)
(down_proj): Linear(in_features=32, out_features=8, bias=False)
(act_fn): SiLUActivation()
)
)
(input_layernorm): Mistral4RMSNorm((8,), eps=1e-06)
(post_attention_layernorm): Mistral4RMSNorm((8,), eps=1e-06)
)
)
(norm): Mistral4RMSNorm((8,), eps=1e-06)
(rotary_emb): Mistral4RotaryEmbedding()
)
)
(lm_head): Linear(in_features=8, out_features=131072, bias=False)
)
```
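A round-trip load of the saved folder is an easy way to confirm the tiny configuration works end to end. This check is not part of the original script; it simply reuses `save_folder` and the imports from above:

```python
# Reload the freshly saved tiny checkpoint and count its parameters
reloaded = Mistral3ForConditionalGeneration.from_pretrained(
    save_folder, trust_remote_code=True
)
n_params = sum(p.numel() for p in reloaded.parameters())
print(f"{n_params / 1e6:.2f}M parameters")
```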
Base model: mistralai/Mistral-Small-4-119B-2603