Commit ·
c4b369c
0
Parent(s):
🚀 Initial full rebuild of Humigence CLI (v1 UX + v2 Engine)
Browse files- README.md +291 -0
- __init__.py +1 -0
- cli/__init__.py +1 -0
- cli/__pycache__/__init__.cpython-310.pyc +0 -0
- cli/__pycache__/fine_tune.cpython-310.pyc +0 -0
- cli/__pycache__/humigence_audit.cpython-310.pyc +0 -0
- cli/__pycache__/main.cpython-310.pyc +0 -0
- cli/fine_tune.py +269 -0
- cli/humigence_audit.py +46 -0
- cli/main.py +33 -0
- config/default_config.json +0 -0
- pipelines/__pycache__/lora_trainer.cpython-310.pyc +0 -0
- pipelines/lora_trainer.py +277 -0
- pyproject.toml +34 -0
- runs/humigence/ACCEPTED.txt +1 -0
- runs/humigence/config.snapshot.json +13 -0
- runs/humigence/eval_prompts.jsonl +5 -0
- runs/humigence/eval_results.jsonl +5 -0
- runs/humigence/reproduce.sh +3 -0
- runs/humigence/run_summary.json +12 -0
- templates/accelerate_config.yaml +0 -0
- utils/__pycache__/device.cpython-310.pyc +0 -0
- utils/__pycache__/validators.cpython-310.pyc +0 -0
- utils/device.py +29 -0
- utils/tokenizer.py +0 -0
- utils/validators.py +23 -0
README.md
ADDED
|
@@ -0,0 +1,291 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🧠 Humigence CLI
|
| 2 |
+
|
| 3 |
+
**Your AI. Your pipeline. Zero code.**
|
| 4 |
+
|
| 5 |
+
A complete MLOps suite built for makers, teams, and enterprises. Humigence provides zero-config, GPU-aware fine-tuning with surgical precision and complete reproducibility.
|
| 6 |
+
|
| 7 |
+
## ✨ Features
|
| 8 |
+
|
| 9 |
+
- 🎯 **Zero-Config Wizard**: Interactive setup with Basic/Advanced modes
|
| 10 |
+
- 🖥️ **Hardware Detection**: Automatic GPU, CPU, and memory detection
|
| 11 |
+
- 🧪 **Training Recipes**: QLoRA, LoRA (FP16/BF16), Full Fine-tuning
|
| 12 |
+
- 📊 **Smart Batching**: Auto-fit micro-batch size to available VRAM
|
| 13 |
+
- 🔄 **Complete Reproducibility**: Config snapshots and reproduce scripts
|
| 14 |
+
- 📈 **Evaluation & Acceptance**: Curated prompts and quality gates
|
| 15 |
+
- 📦 **Artifact Export**: Structured outputs with run summaries
|
| 16 |
+
|
| 17 |
+
## 🚀 Quick Start
|
| 18 |
+
|
| 19 |
+
### Installation
|
| 20 |
+
|
| 21 |
+
```bash
|
| 22 |
+
# Clone the repository
|
| 23 |
+
git clone https://github.com/your-username/humigence.git
|
| 24 |
+
cd humigence
|
| 25 |
+
|
| 26 |
+
# Install dependencies
|
| 27 |
+
pip install -e .
|
| 28 |
+
|
| 29 |
+
# Set up CLI alias (optional)
|
| 30 |
+
echo "alias humigence='python3 ~/humigence/cli/main.py'" >> ~/.bashrc
|
| 31 |
+
source ~/.bashrc
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
### Basic Usage
|
| 35 |
+
|
| 36 |
+
```bash
|
| 37 |
+
# Launch the interactive wizard
|
| 38 |
+
humigence
|
| 39 |
+
|
| 40 |
+
# Or run directly
|
| 41 |
+
python3 -m cli.main
|
| 42 |
+
|
| 43 |
+
# Run training from config
|
| 44 |
+
python3 -m pipelines.lora_trainer runs/humigence/config.snapshot.json
|
| 45 |
+
|
| 46 |
+
# Audit a training run
|
| 47 |
+
python3 -m cli.humigence_audit
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
## 🎯 Training Workflow
|
| 51 |
+
|
| 52 |
+
### 1. Interactive Setup
|
| 53 |
+
|
| 54 |
+
The Humigence wizard guides you through:
|
| 55 |
+
|
| 56 |
+
- **Setup Mode**: Basic (essential config) or Advanced (full control)
|
| 57 |
+
- **Hardware Detection**: Automatic GPU, CPU, and memory detection
|
| 58 |
+
- **Model Selection**: HuggingFace cache scanning + manual entry
|
| 59 |
+
- **Dataset Loading**: Auto-detection from `~/humigence_data/`
|
| 60 |
+
- **Training Recipe**: QLoRA, LoRA, or Full Fine-tuning
|
| 61 |
+
- **Hyperparameters**: Learning rate, epochs, batch size, etc.
|
| 62 |
+
|
| 63 |
+
### 2. Training Execution
|
| 64 |
+
|
| 65 |
+
```bash
|
| 66 |
+
🚀 Humigence Trainer Starting...
|
| 67 |
+
✅ Configuration Loaded: [all settings]
|
| 68 |
+
📦 Estimated Micro-batch Size: 4
|
| 69 |
+
⚠️ Loading model without quantization (RTX 5090 compatibility)
|
| 70 |
+
✅ Model + Tokenizer Loaded: Qwen/Qwen1.5-0.5B
|
| 71 |
+
✅ LoRA adapters applied
|
| 72 |
+
📚 Loading dataset...
|
| 73 |
+
✅ Dataset loaded: 10 samples
|
| 74 |
+
🚀 Starting training...
|
| 75 |
+
✅ Training complete — adapters saved.
|
| 76 |
+
```
|
| 77 |
+
|
| 78 |
+
### 3. Evaluation & Acceptance
|
| 79 |
+
|
| 80 |
+
- **Curated Prompts**: 5 diverse evaluation questions
|
| 81 |
+
- **Model Inference**: Generation with temperature and sampling
|
| 82 |
+
- **Acceptance Criteria**: Loss threshold (< 0.8) and eval count (≥ 1)
|
| 83 |
+
- **Status Markers**: ACCEPTED.txt or REJECTED.txt files
|
| 84 |
+
|
| 85 |
+
### 4. Artifact Export
|
| 86 |
+
|
| 87 |
+
```
|
| 88 |
+
runs/humigence/
|
| 89 |
+
├── adapters/ # LoRA adapter weights
|
| 90 |
+
├── tokenizer/ # Tokenizer used
|
| 91 |
+
├── config.snapshot.json # Training config
|
| 92 |
+
├── reproduce.sh # Rerun script
|
| 93 |
+
├── ACCEPTED.txt / REJECTED.txt
|
| 94 |
+
├── eval_results.jsonl # Evaluation prompt outputs
|
| 95 |
+
├── run_summary.json # Structured run summary
|
| 96 |
+
└── artifacts.zip # Complete export archive
|
| 97 |
+
```
|
| 98 |
+
|
| 99 |
+
## 🔧 Configuration
|
| 100 |
+
|
| 101 |
+
### Basic Mode (Recommended)
|
| 102 |
+
|
| 103 |
+
Essential configuration with sensible defaults:
|
| 104 |
+
|
| 105 |
+
- **Learning Rate**: 2e-5
|
| 106 |
+
- **Epochs**: 3
|
| 107 |
+
- **Gradient Accumulation**: 4
|
| 108 |
+
- **Logging Steps**: 10
|
| 109 |
+
- **Save Steps**: 100
|
| 110 |
+
|
| 111 |
+
### Advanced Mode
|
| 112 |
+
|
| 113 |
+
Full control over all parameters:
|
| 114 |
+
|
| 115 |
+
- Gradient accumulation steps
|
| 116 |
+
- Learning rate
|
| 117 |
+
- Evaluation strategy
|
| 118 |
+
- Save steps
|
| 119 |
+
- Warmup steps
|
| 120 |
+
- Number of training epochs
|
| 121 |
+
- Logging steps
|
| 122 |
+
- Random seed
|
| 123 |
+
|
| 124 |
+
## 📊 Supported Models
|
| 125 |
+
|
| 126 |
+
- **Qwen/Qwen1.5-0.5B**: 77M parameters
|
| 127 |
+
- **microsoft/Phi-2**: 839M parameters
|
| 128 |
+
- **TinyLlama/TinyLlama-1.1B-Chat-v1.0**: 369M parameters
|
| 129 |
+
- **Custom Models**: HuggingFace repos or local paths
|
| 130 |
+
|
| 131 |
+
## 🗂️ Dataset Support
|
| 132 |
+
|
| 133 |
+
- **OpenAssistant Format**: Automatic conversation pairing
|
| 134 |
+
- **Instruction-Response**: Standard format support
|
| 135 |
+
- **JSONL Files**: Line-by-line JSON processing
|
| 136 |
+
- **Auto-Detection**: Scans `~/humigence_data/` directory
|
| 137 |
+
|
| 138 |
+
## 🖥️ Hardware Requirements
|
| 139 |
+
|
| 140 |
+
- **GPU**: NVIDIA GPU with CUDA support (RTX 5090 compatible)
|
| 141 |
+
- **RAM**: 8GB+ recommended
|
| 142 |
+
- **Storage**: 10GB+ for models and datasets
|
| 143 |
+
- **Python**: 3.8+ with PyTorch
|
| 144 |
+
|
| 145 |
+
## 📁 Project Structure
|
| 146 |
+
|
| 147 |
+
```
|
| 148 |
+
humigence/
|
| 149 |
+
├── cli/
|
| 150 |
+
│ ├── main.py # CLI entry point
|
| 151 |
+
│ ├── fine_tune.py # Interactive wizard
|
| 152 |
+
│ └── humigence_audit.py # Run inspector
|
| 153 |
+
├── config/
|
| 154 |
+
│ └── default_config.json # Fallback defaults
|
| 155 |
+
├── pipelines/
|
| 156 |
+
│ └── lora_trainer.py # Training engine
|
| 157 |
+
├── templates/
|
| 158 |
+
│ └── accelerate_config.yaml
|
| 159 |
+
├── utils/
|
| 160 |
+
│ ├── device.py # Hardware detection
|
| 161 |
+
│ ├── tokenizer.py # Tokenizer utilities
|
| 162 |
+
│ └── validators.py # Dataset validation
|
| 163 |
+
└── runs/
|
| 164 |
+
└── <run_name>/
|
| 165 |
+
├── config.snapshot.json
|
| 166 |
+
├── reproduce.sh
|
| 167 |
+
├── adapters/
|
| 168 |
+
├── tokenizer/
|
| 169 |
+
└── artifacts.zip
|
| 170 |
+
```
|
| 171 |
+
|
| 172 |
+
## 🔄 Reproducibility
|
| 173 |
+
|
| 174 |
+
Every training run generates:
|
| 175 |
+
|
| 176 |
+
- **Config Snapshot**: Complete configuration in JSON
|
| 177 |
+
- **Reproduce Script**: One-click rerun capability
|
| 178 |
+
- **Artifact Archive**: Complete export of all outputs
|
| 179 |
+
- **Run Summary**: Structured metadata for tracking
|
| 180 |
+
|
| 181 |
+
```bash
|
| 182 |
+
# Rerun any training
|
| 183 |
+
./runs/humigence/reproduce.sh
|
| 184 |
+
|
| 185 |
+
# Or use the config directly
|
| 186 |
+
python3 -m pipelines.lora_trainer runs/humigence/config.snapshot.json
|
| 187 |
+
```
|
| 188 |
+
|
| 189 |
+
## 🧪 Evaluation
|
| 190 |
+
|
| 191 |
+
### Curated Prompts
|
| 192 |
+
|
| 193 |
+
Default evaluation questions:
|
| 194 |
+
|
| 195 |
+
1. "What is the capital of France?"
|
| 196 |
+
2. "Explain quantum computing in simple terms."
|
| 197 |
+
3. "Write a short poem about artificial intelligence."
|
| 198 |
+
4. "How do you make a good cup of coffee?"
|
| 199 |
+
5. "What are the benefits of renewable energy?"
|
| 200 |
+
|
| 201 |
+
### Custom Evaluation
|
| 202 |
+
|
| 203 |
+
Create `runs/humigence/eval_prompts.jsonl`:
|
| 204 |
+
|
| 205 |
+
```json
|
| 206 |
+
{"instruction": "Your custom prompt here"}
|
| 207 |
+
{"instruction": "Another evaluation question"}
|
| 208 |
+
```
|
| 209 |
+
|
| 210 |
+
## 📈 Monitoring
|
| 211 |
+
|
| 212 |
+
### Run Audit
|
| 213 |
+
|
| 214 |
+
Inspect any training run:
|
| 215 |
+
|
| 216 |
+
```bash
|
| 217 |
+
python3 -m cli.humigence_audit
|
| 218 |
+
```
|
| 219 |
+
|
| 220 |
+
Shows:
|
| 221 |
+
- Training configuration
|
| 222 |
+
- Run status (ACCEPTED/REJECTED)
|
| 223 |
+
- Final metrics
|
| 224 |
+
- Evaluation results
|
| 225 |
+
|
| 226 |
+
### Run Summary
|
| 227 |
+
|
| 228 |
+
Structured JSON output:
|
| 229 |
+
|
| 230 |
+
```json
|
| 231 |
+
{
|
| 232 |
+
"run_id": "2025-09-17T22:50:18.668019",
|
| 233 |
+
"status": "accepted",
|
| 234 |
+
"model": "Qwen/Qwen1.5-0.5B",
|
| 235 |
+
"dataset": "/path/to/dataset.jsonl",
|
| 236 |
+
"recipe": "QLoRA (4-bit NF4)",
|
| 237 |
+
"epochs": "3",
|
| 238 |
+
"learning_rate": "2e-5",
|
| 239 |
+
"final_loss": 0.65,
|
| 240 |
+
"eval_prompt_count": 5,
|
| 241 |
+
"timestamp": "2025-09-17 23:31:01"
|
| 242 |
+
}
|
| 243 |
+
```
|
| 244 |
+
|
| 245 |
+
## 🛠️ Development
|
| 246 |
+
|
| 247 |
+
### Dependencies
|
| 248 |
+
|
| 249 |
+
- `typer`: CLI framework
|
| 250 |
+
- `rich`: Terminal formatting
|
| 251 |
+
- `inquirerpy`: Interactive prompts
|
| 252 |
+
- `transformers`: HuggingFace models
|
| 253 |
+
- `peft`: Parameter-efficient fine-tuning
|
| 254 |
+
- `bitsandbytes`: Quantization
|
| 255 |
+
- `accelerate`: Multi-GPU training
|
| 256 |
+
- `datasets`: Dataset handling
|
| 257 |
+
- `psutil`: System monitoring
|
| 258 |
+
|
| 259 |
+
### Installation
|
| 260 |
+
|
| 261 |
+
```bash
|
| 262 |
+
# Install in development mode
|
| 263 |
+
pip install -e .
|
| 264 |
+
|
| 265 |
+
# Or install dependencies manually
|
| 266 |
+
pip install typer rich inquirerpy transformers peft bitsandbytes accelerate datasets psutil
|
| 267 |
+
```
|
| 268 |
+
|
| 269 |
+
## 🤝 Contributing
|
| 270 |
+
|
| 271 |
+
1. Fork the repository
|
| 272 |
+
2. Create a feature branch
|
| 273 |
+
3. Make your changes
|
| 274 |
+
4. Add tests if applicable
|
| 275 |
+
5. Submit a pull request
|
| 276 |
+
|
| 277 |
+
## 📄 License
|
| 278 |
+
|
| 279 |
+
MIT License - see LICENSE file for details
|
| 280 |
+
|
| 281 |
+
## 🙏 Acknowledgments
|
| 282 |
+
|
| 283 |
+
- HuggingFace for the transformers library
|
| 284 |
+
- Microsoft for PEFT and LoRA implementations
|
| 285 |
+
- The open-source ML community
|
| 286 |
+
|
| 287 |
+
---
|
| 288 |
+
|
| 289 |
+
**Built with ❤️ for the AI community**
|
| 290 |
+
|
| 291 |
+
*Humigence — Your AI. Your pipeline. Zero code.*
|
__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Humigence package."""
|
cli/__init__.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Humigence CLI package."""
|
cli/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (165 Bytes). View file
|
|
|
cli/__pycache__/fine_tune.cpython-310.pyc
ADDED
|
Binary file (5.76 kB). View file
|
|
|
cli/__pycache__/humigence_audit.cpython-310.pyc
ADDED
|
Binary file (1.41 kB). View file
|
|
|
cli/__pycache__/main.cpython-310.pyc
ADDED
|
Binary file (1.08 kB). View file
|
|
|
cli/fine_tune.py
ADDED
|
@@ -0,0 +1,269 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from InquirerPy import prompt
|
| 2 |
+
from rich.console import Console
|
| 3 |
+
from rich.table import Table
|
| 4 |
+
from utils.device import get_system_info
|
| 5 |
+
from utils.validators import detect_datasets
|
| 6 |
+
import os
|
| 7 |
+
import json
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
import datetime
|
| 10 |
+
|
| 11 |
+
console = Console()
|
| 12 |
+
|
| 13 |
+
def display_system_summary():
    """Render the detected hardware (CPU / RAM / GPUs) as a rich table."""
    system_info = get_system_info()

    summary = Table(title="🖥️ System Detection Summary", show_lines=True)
    summary.add_column("Property", style="cyan", no_wrap=True)
    summary.add_column("Value", style="green")

    for prop, value in system_info.items():
        # "GPUs" comes back as a list of dicts; expand each GPU into two rows.
        if prop == "GPUs":
            for index, gpu in enumerate(value):
                summary.add_row(f"GPU {index} Name", gpu["name"])
                summary.add_row(f"GPU {index} Memory", gpu["memory"])
            continue
        summary.add_row(prop, str(value))

    console.print("\n")
    console.print(summary)
+
def get_available_models():
    """Return a sorted, de-duplicated list of base-model choices.

    Scans the local Hugging Face hub cache (``~/.cache/huggingface/hub``)
    for downloaded models — stored as directories named
    ``models--<org>--<name>`` — and appends a few curated defaults plus a
    manual-entry escape hatch.

    Returns:
        list[str]: sorted, unique model identifiers (``org/name`` form).
    """
    # BUG FIX: the previous code expanded "~/.cache/huggingface/hub/models--",
    # which is a directory-name *prefix*, not a real path — os.path.exists()
    # was always False, so locally cached models were silently ignored.
    hf_hub = os.path.expanduser("~/.cache/huggingface/hub")
    model_choices = []

    if os.path.isdir(hf_hub):
        for entry in os.listdir(hf_hub):
            # Cache layout: hub/models--<org>--<name>/snapshots/<rev>/...
            if entry.startswith("models--"):
                model_choices.append(entry[len("models--"):].replace("--", "/"))

    # Add manually defined models
    model_choices += [
        "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        "microsoft/Phi-2",
        "Qwen/Qwen1.5-0.5B",
        "manual-entry (custom path/repo)"
    ]

    # De-dupe and sort
    return sorted(set(model_choices))
+
def run():
    """Interactive supervised fine-tuning wizard.

    Guides the user through setup mode, GPU choice, base model, dataset,
    training recipe, and hyperparameters; persists the resulting config to
    ``runs/humigence/config.snapshot.json`` plus a ``reproduce.sh`` rerun
    script, then asks for confirmation before handing off to training.
    """
    console.print("\n[bold magenta]🧪 Supervised Fine-Tuning Setup[/bold magenta]")

    questions = [
        {
            "type": "list",
            "name": "setup_mode",
            "message": "Choose Setup Mode:",
            "choices": ["Basic Setup – Essential configuration only", "Advanced Setup – Full control over all parameters"],
        }
    ]

    answers = prompt(questions)
    # First word of the choice label, lowercased: 'basic' or 'advanced'.
    setup_mode = answers.get("setup_mode").split(" ")[0].lower()

    console.print(f"\n[green]✅ You selected:[/green] [yellow]{answers.get('setup_mode')}[/yellow]")

    # Display system summary
    display_system_summary()

    # GPU selection
    gpu_options = []
    info = get_system_info()
    for idx, gpu in enumerate(info['GPUs']):
        gpu_options.append(f"Single GPU – GPU {idx}: {gpu['name']}")

    # Multi-GPU choices only make sense when more than one GPU was detected.
    if len(gpu_options) > 1:
        gpu_options.append("Multi-GPU – All")
        gpu_options.append("Multi-GPU – Custom")

    gpu_question = [
        {
            "type": "list",
            "name": "gpu_choice",
            # BUG FIX: original message contained mojibake ("\ufffd\ufffd")
            # where an emoji was lost in an encoding round-trip.
            "message": "🖥️ Choose Training Configuration:",
            "choices": gpu_options,
        }
    ]
    gpu_answer = prompt(gpu_question)
    selected_gpu = gpu_answer.get("gpu_choice")

    console.print(f"\n[green]✅ You selected GPU config:[/green] [yellow]{selected_gpu}[/yellow]")

    # Model selection
    model_question = [
        {
            "type": "list",
            "name": "base_model",
            "message": "🧠 Choose Base Model:",
            "choices": get_available_models()
        }
    ]

    model_answer = prompt(model_question)
    selected_model = model_answer.get("base_model")

    # If manual-entry selected, ask for a free-form repo/path instead.
    if selected_model == "manual-entry (custom path/repo)":
        manual_input = prompt([
            {
                "type": "input",
                "name": "custom_model",
                "message": "Enter Hugging Face repo or local model path:"
            }
        ])
        selected_model = manual_input.get("custom_model")

    console.print(f"\n[green]✅ You selected model:[/green] [yellow]{selected_model}[/yellow]")

    # Dataset selection
    dataset_options = detect_datasets()
    if not dataset_options:
        console.print("[bold red]⚠️ No datasets found in ~/humigence_data[/bold red]")
        return

    dataset_question = [
        {
            "type": "list",
            "name": "dataset_path",
            "message": "📚 Choose Dataset to Train On:",
            "choices": [opt[0] for opt in dataset_options]
        }
    ]

    dataset_answer = prompt(dataset_question)
    # Map the chosen display name back to its filesystem path.
    selected_dataset = [
        path for name, path in dataset_options if name == dataset_answer["dataset_path"]
    ][0]

    console.print(f"\n[green]✅ You selected dataset:[/green] [yellow]{selected_dataset}[/yellow]")

    # Training recipe selection
    recipe_question = [
        {
            "type": "list",
            "name": "recipe",
            "message": "🧪 Choose Training Recipe:",
            "choices": [
                "QLoRA (4-bit NF4)",
                "LoRA (FP16)",
                "LoRA (BF16)",
                "Full Fine-tuning (FP32)"
            ],
        }
    ]

    recipe_answer = prompt(recipe_question)
    selected_recipe = recipe_answer.get("recipe")

    console.print(f"\n[green]✅ Training recipe:[/green] [yellow]{selected_recipe}[/yellow]")

    # Parameter branching - Basic vs Advanced.
    # Note: values are kept as strings in both paths so the snapshot JSON
    # is shaped identically regardless of mode.
    if setup_mode == "advanced":
        param_questions = [
            {
                "type": "input",
                "name": "learning_rate",
                "message": "Enter Learning Rate:",
                "default": "2e-5"
            },
            {
                "type": "input",
                "name": "num_train_epochs",
                "message": "Enter Number of Epochs:",
                "default": "3"
            },
            {
                "type": "input",
                "name": "gradient_accumulation_steps",
                "message": "Enter Gradient Accumulation Steps:",
                "default": "4"
            },
            {
                "type": "input",
                "name": "logging_steps",
                "message": "Enter Logging Steps:",
                "default": "10"
            },
            {
                "type": "input",
                "name": "save_steps",
                "message": "Enter Save Steps:",
                "default": "100"
            }
        ]

        param_answers = prompt(param_questions)
    else:
        # Basic mode defaults (mirror the advanced-mode prompt defaults)
        param_answers = {
            "learning_rate": "2e-5",
            "num_train_epochs": "3",
            "gradient_accumulation_steps": "4",
            "logging_steps": "10",
            "save_steps": "100"
        }

    console.print(f"\n[cyan]📦 Hyperparameters Loaded:[/cyan]")
    for k, v in param_answers.items():
        console.print(f"[bold]{k}[/bold]: {v}")

    # Combine config
    final_config = {
        "setup_mode": setup_mode,
        "gpu_config": selected_gpu,
        "base_model": selected_model,
        "dataset_path": selected_dataset,
        "training_recipe": selected_recipe,
        **param_answers,
        "timestamp": datetime.datetime.now().isoformat()
    }

    # Create directory and write config snapshot
    run_dir = Path("runs/humigence")
    run_dir.mkdir(parents=True, exist_ok=True)
    snapshot_path = run_dir / "config.snapshot.json"

    with open(snapshot_path, "w") as f:
        json.dump(final_config, f, indent=2)

    console.print(f"\n[bold green]✅ Configuration saved to:[/bold green] [cyan]{snapshot_path}[/cyan]")

    # Generate reproduce.sh script
    reproduce_script = f"""#!/bin/bash
# Re-run this exact training config
python3 -m pipelines.lora_trainer --config {snapshot_path}
"""

    reproduce_path = run_dir / "reproduce.sh"
    with open(reproduce_path, "w") as f:
        f.write(reproduce_script)

    # Make executable
    reproduce_path.chmod(0o755)

    console.print(f"[bold green]✅ Reproduction script saved to:[/bold green] [cyan]{reproduce_path}[/cyan]")

    # Final confirmation prompt
    final_prompt = prompt([
        {
            "type": "confirm",
            "name": "confirm_training",
            "message": "🚀 Proceed with training now?",
            "default": True
        }
    ])

    if not final_prompt["confirm_training"]:
        console.print("[bold yellow]❌ Training cancelled.[/bold yellow]")
        return

    # BUG FIX: this message was mojibake ("\ufffd\ufffd Starting training...");
    # restored the rocket emoji shown in the README's sample output.
    console.print("[bold green]🚀 Starting training...[/bold green]")
    # Call training engine next (Step 13)

if __name__ == "__main__":
    run()
cli/humigence_audit.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# humigence_audit.py
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
from rich.console import Console
|
| 6 |
+
from rich.table import Table
|
| 7 |
+
|
| 8 |
+
console = Console()
|
| 9 |
+
|
| 10 |
+
def audit_run(run_dir="runs/humigence"):
    """Print a human-readable audit of a Humigence training run.

    Reads ``config.snapshot.json``, ``run_summary.json``, and the
    ``ACCEPTED.txt`` / ``REJECTED.txt`` status markers from *run_dir* and
    renders them with rich.

    Args:
        run_dir: Path to the run directory (default ``runs/humigence``).
    """
    config_path = Path(run_dir) / "config.snapshot.json"
    summary_path = Path(run_dir) / "run_summary.json"
    status = "❌ Not found"

    if Path(run_dir, "ACCEPTED.txt").exists():
        status = "✅ ACCEPTED"
    elif Path(run_dir, "REJECTED.txt").exists():
        status = "❌ REJECTED"

    console.rule("[bold magenta]Humigence Run Audit")

    # Load config
    if config_path.exists():
        with open(config_path) as f:
            cfg = json.load(f)

        table = Table(title="Training Configuration", show_lines=True)
        # BUG FIX: rich tables need columns defined before add_row();
        # without these two add_column() calls the audit crashed on any
        # run that had a config snapshot.
        table.add_column("Key", style="cyan", no_wrap=True)
        table.add_column("Value", style="green")
        for k, v in cfg.items():
            table.add_row(k, str(v))
        console.print(table)
    else:
        console.print("[red]❌ config.snapshot.json not found[/red]")

    # Load summary
    if summary_path.exists():
        with open(summary_path) as f:
            summary = json.load(f)
        # .get() keeps the audit working even if an older run lacks "status".
        console.print(f"\n[bold green]📄 Summary:[/bold green] {summary.get('status', 'unknown')}")
        console.print(json.dumps(summary, indent=2))
    else:
        console.print("[yellow]⚠️ run_summary.json not found[/yellow]")

    console.print(f"\n[bold cyan]📌 Run Status:[/bold cyan] {status}")

if __name__ == "__main__":
    audit_run()
cli/main.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Humigence CLI entry point (run as ``python3 -m cli.main`` or via the ``humigence`` alias)."""

import typer
from rich.console import Console
import sys
from pathlib import Path

# Add the parent directory to the path so we can import from cli
# (must happen before the ``from cli import fine_tune`` below).
sys.path.insert(0, str(Path(__file__).parent.parent))

from cli import fine_tune

app = typer.Typer()
console = Console()

@app.callback(invoke_without_command=True)
def main(ctx: typer.Context):
    """Top-level callback: with no subcommand, show the banner/menu and launch the wizard."""
    if ctx.invoked_subcommand is None:
        console.print("[bold cyan]Humigence — Your AI. Your pipeline. Zero code.[/bold cyan]")
        console.print("[green]A complete MLOps suite built for makers, teams, and enterprises.[/green]")
        console.print()
        console.print("Options:")
        console.print("1. Supervised Fine-Tuning ✅")
        console.print("2. RAG Implementation (coming soon)")
        console.print("3. EnterpriseGPT (coming soon)")
        console.print("4. Batch Inference (coming soon)")
        console.print("5. Context Length (coming soon)")
        console.print()
        console.print("Starting Supervised Fine-Tuning...")
        fine_tune.run()

# Also register the wizard as an explicit Typer subcommand.
app.command()(fine_tune.run)

if __name__ == "__main__":
    app()
config/default_config.json
ADDED
|
File without changes
|
pipelines/__pycache__/lora_trainer.cpython-310.pyc
ADDED
|
Binary file (2.73 kB). View file
|
|
|
pipelines/lora_trainer.py
ADDED
|
@@ -0,0 +1,277 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# lora_trainer.py
|
| 2 |
+
|
| 3 |
+
import json
|
| 4 |
+
import typer
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from rich.console import Console
|
| 7 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments, Trainer, DataCollatorForLanguageModeling, TextStreamer
|
| 8 |
+
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
|
| 9 |
+
from datasets import load_dataset
|
| 10 |
+
import os
|
| 11 |
+
import zipfile
|
| 12 |
+
import time
|
| 13 |
+
|
| 14 |
+
app = typer.Typer()
|
| 15 |
+
console = Console()
|
| 16 |
+
|
| 17 |
+
def estimate_micro_batch_size():
    """Pick a per-device micro-batch size from the GPU's total VRAM.

    Returns 1 on CPU-only machines; otherwise maps the total VRAM (GiB)
    of CUDA device 0 onto a conservative batch-size tier.
    """
    import torch

    if not torch.cuda.is_available():
        return 1

    vram_gib = torch.cuda.get_device_properties(0).total_memory / (1024 ** 3)
    # (minimum VRAM in GiB, batch size) tiers, largest first.
    for min_vram, batch_size in ((40, 8), (20, 4), (10, 2)):
        if vram_gib > min_vram:
            return batch_size
    return 1
+
|
| 33 |
+
def load_tokenizer_and_model(cfg):
    """Load the tokenizer and (unquantized) base model named in *cfg*.

    *cfg* must contain ``"base_model"`` (HF repo or local path) and
    ``"training_recipe"`` (a string; "BF16" anywhere in it selects
    bfloat16, otherwise float16).

    Returns:
        (tokenizer, model) tuple.
    """
    model_id = cfg["base_model"]
    recipe_name = cfg["training_recipe"]

    # Load tokenizer; reuse EOS as the pad token (causal LMs often ship
    # without one).
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    tokenizer.pad_token = tokenizer.eos_token

    # For now, load model without quantization due to RTX 5090 compatibility issues
    # TODO: Re-enable quantization once PyTorch/bitsandbytes supports RTX 5090
    console.print("[yellow]⚠️ Loading model without quantization (RTX 5090 compatibility)[/yellow]")

    # Load base model; the recipe string decides the compute dtype.
    dtype = "bfloat16" if "BF16" in recipe_name else "float16"
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=dtype,
    )

    return tokenizer, model
| 54 |
+
|
| 55 |
+
def apply_lora(model, cfg):
    """Wrap ``model`` with PEFT LoRA adapters.

    Previously ``cfg`` was accepted but ignored and all LoRA hyperparameters
    were hard-coded. They can now be overridden via optional cfg keys
    "lora_r", "lora_alpha" and "lora_dropout"; the old hard-coded values
    (16 / 32 / 0.05) remain the defaults, so existing configs behave
    identically.

    Args:
        model: The loaded base causal LM.
        cfg: Config dict (optional LoRA keys as above).

    Returns:
        The PEFT-wrapped model.
    """
    return get_peft_model(model, LoraConfig(
        r=int(cfg.get("lora_r", 16)),
        lora_alpha=int(cfg.get("lora_alpha", 32)),
        lora_dropout=float(cfg.get("lora_dropout", 0.05)),
        bias="none",
        task_type="CAUSAL_LM"
    ))
|
| 63 |
+
|
| 64 |
+
def load_small_dataset(dataset_path, tokenizer):
    """Build a tiny tokenized training set from an OpenAssistant-style JSONL file.

    Reads at most the first 100 JSONL records, groups them by
    ``message_tree_id``, pairs the first "prompter" message with the first
    "assistant" message of each tree, and tokenizes the resulting
    "### Instruction / ### Response" texts.

    Args:
        dataset_path: Path to a JSONL file of OASST messages (one per line).
        tokenizer: HF tokenizer (callable accepting padding/truncation kwargs).

    Returns:
        The tokenizer output (batch with "input_ids"/"attention_mask").
        Falls back to a single dummy sample when no usable pairs are found.
    """
    # Load first 100 samples max (keeps smoke-test runs fast).
    # (The redundant function-local `import json` was removed; json is
    # already imported at module level.)
    data = []
    with open(dataset_path, "r") as f:
        for i, line in enumerate(f):
            if i >= 100:
                break
            data.append(json.loads(line))

    # Handle OpenAssistant format - group by message_tree_id.
    conversations = {}
    for sample in data:
        conversations.setdefault(sample.get("message_tree_id"), []).append(sample)

    # Create instruction-response pairs.
    # NOTE(review): this pairs the first prompter/assistant messages in file
    # order and ignores parent_id threading — adequate for a smoke test only.
    texts = []
    for messages in conversations.values():
        if len(messages) < 2:
            continue
        prompter_msg = None
        assistant_msg = None
        for msg in messages:
            if msg.get("role") == "prompter" and prompter_msg is None:
                prompter_msg = msg
            elif msg.get("role") == "assistant" and assistant_msg is None:
                assistant_msg = msg
        if prompter_msg and assistant_msg:
            texts.append(
                f"### Instruction:\n{prompter_msg['text']}\n\n### Response:\n{assistant_msg['text']}"
            )

    # Fallback keeps the trainer from crashing on an empty/unpairable file.
    if not texts:
        texts = ["### Instruction:\nHello\n\n### Response:\nHi there!"]

    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        return_tensors="pt"
    )
|
| 118 |
+
|
| 119 |
+
def get_training_args(cfg, output_dir="runs/humigence", micro_batch_size=1):
    """Translate the config snapshot into HF ``TrainingArguments``.

    Args:
        cfg: Config dict with string-or-numeric hyperparameter values
            (values are cast, since the snapshot stores them as strings).
        output_dir: Where checkpoints and logs are written.
        micro_batch_size: Per-device train batch size. Default 1 preserves
            the previous hard-coded behavior; callers may pass the result of
            ``estimate_micro_batch_size()``, which was previously computed
            but never used.

    Returns:
        A ``transformers.TrainingArguments`` ready for ``Trainer``.
    """
    recipe = cfg["training_recipe"]
    return TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=int(micro_batch_size),
        gradient_accumulation_steps=int(cfg["gradient_accumulation_steps"]),
        num_train_epochs=int(cfg["num_train_epochs"]),
        learning_rate=float(cfg["learning_rate"]),
        logging_steps=int(cfg["logging_steps"]),
        save_steps=int(cfg["save_steps"]),
        save_total_limit=1,  # keep only the most recent checkpoint on disk
        bf16="BF16" in recipe,
        fp16="FP16" in recipe,
        # NOTE(review): "evaluation_strategy" was renamed "eval_strategy" in
        # newer transformers releases — confirm against the pinned version.
        evaluation_strategy="no",
        save_strategy="steps",
        report_to="none"
    )
|
| 135 |
+
|
| 136 |
+
def run_evaluation(model, tokenizer, eval_path="runs/humigence/eval_prompts.jsonl"):
    """Generate a completion for every prompt in ``eval_path`` and echo it.

    Args:
        model: Causal LM with ``.generate()`` and a ``.device`` attribute.
        tokenizer: Matching tokenizer.
        eval_path: JSONL file whose lines each hold {"instruction": ...}.

    Returns:
        List of {"prompt", "output"} dicts; empty list when the prompt file
        is missing (evaluation is optional).
    """
    if not Path(eval_path).exists():
        console.print("[yellow]⚠️ No evaluation prompts found — skipping eval[/yellow]")
        return []

    with open(eval_path, "r") as f:
        prompts = [json.loads(line)["instruction"] for line in f]

    # The previous `streamer = TextStreamer(tokenizer)` was never passed to
    # generate() and had no effect — removed as an unused local.
    results = []

    for i, prompt in enumerate(prompts):
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(model.device)

        # Sampled decoding: non-deterministic by design for a qualitative eval.
        output = model.generate(
            input_ids,
            max_new_tokens=200,
            temperature=0.7,
            do_sample=True
        )

        decoded = tokenizer.decode(output[0], skip_special_tokens=True)
        console.print(f"\n[bold cyan]📌 Prompt {i+1}[/bold cyan]: {prompt}")
        console.print(f"[bold green]🧠 Model Output[/bold green]: {decoded}")

        results.append({"prompt": prompt, "output": decoded})

    return results
|
| 164 |
+
|
| 165 |
+
def passed_acceptance_criteria(eval_results, trainer):
    """Decide whether the run meets the minimal acceptance bar.

    Accept when the most recent logged training loss is below 0.8 AND at
    least one evaluation result was produced.

    Bug fixed: after ``trainer.train()`` the final ``log_history`` entry is
    the run-summary dict (train_runtime, "train_loss", ...) which carries no
    "loss" key, so ``log_history[-1].get("loss", 999)`` always yielded the
    999 default and every run was rejected. We now scan backwards for the
    latest entry carrying "loss" or "train_loss", and an empty history no
    longer raises IndexError.

    Args:
        eval_results: List of evaluation result dicts from run_evaluation().
        trainer: HF Trainer whose state.log_history is inspected.

    Returns:
        bool: True when both criteria are satisfied.
    """
    loss = 999
    for entry in reversed(trainer.state.log_history):
        if "loss" in entry or "train_loss" in entry:
            loss = entry.get("loss", entry.get("train_loss", 999))
            break
    return loss < 0.8 and len(eval_results) >= 1
|
| 168 |
+
|
| 169 |
+
def zip_artifacts(folder_path, zip_path):
    """Zip every file under ``folder_path`` (recursively) into ``zip_path``.

    Archive member names are relative to ``folder_path``. The output archive
    itself is skipped: ``zip_path`` commonly lives inside the folder being
    archived (e.g. runs/humigence/artifacts.zip), and the previous version
    could sweep the half-written zip into its own contents.

    Args:
        folder_path: Directory whose files are archived.
        zip_path: Destination .zip file path.
    """
    zip_target = Path(zip_path).resolve()
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for path in Path(folder_path).rglob('*'):
            if path.is_file() and path.resolve() != zip_target:
                zipf.write(path, path.relative_to(folder_path))
|
| 174 |
+
|
| 175 |
+
@app.command()
def main(config: Path = typer.Argument(help="Path to config.snapshot.json")):
    """End-to-end smoke-training run.

    Loads the config snapshot, trains LoRA adapters on a small dataset
    slice, runs a qualitative evaluation, gates the run on acceptance
    criteria, and exports all artifacts under runs/humigence/.
    """
    console.print("[bold cyan]🚀 Humigence Trainer Starting...[/bold cyan]")

    # Load config file
    if not config.exists():
        console.print(f"[bold red]❌ Config file not found:[/bold red] {config}")
        raise typer.Exit(code=1)

    with open(config, "r") as f:
        cfg = json.load(f)

    # Echo key config values for debugging
    console.print("[bold green]✅ Configuration Loaded:[/bold green]")
    for k, v in cfg.items():
        console.print(f"[bold]{k}[/bold]: {v}")

    # Auto micro-batch size estimation
    # NOTE(review): micro_batch is displayed but never passed to
    # get_training_args, which uses its own fixed batch size — confirm intent.
    micro_batch = estimate_micro_batch_size()
    console.print(f"[bold blue]📦 Estimated Micro-batch Size:[/bold blue] {micro_batch}")

    # Load tokenizer and model
    tokenizer, model = load_tokenizer_and_model(cfg)
    console.print(f"[bold green]✅ Model + Tokenizer Loaded:[/bold green] [yellow]{cfg['base_model']}[/yellow]")

    # Apply LoRA if needed (recipe string mentions LoRA or QLoRA)
    if "LoRA" in cfg["training_recipe"] or "QLoRA" in cfg["training_recipe"]:
        model = apply_lora(model, cfg)
        console.print("[bold green]✅ LoRA adapters applied[/bold green]")

    # Load dataset (first 100 JSONL records, tokenized as one batch)
    console.print("[bold blue]📚 Loading dataset...[/bold blue]")
    dataset = load_small_dataset(cfg["dataset_path"], tokenizer)
    console.print(f"[bold green]✅ Dataset loaded: {len(dataset['input_ids'])} samples[/bold green]")

    # Build dataset format: one dict per row for the Trainer
    train_dataset = [{"input_ids": x, "attention_mask": y} for x, y in zip(dataset["input_ids"], dataset["attention_mask"])]

    # Setup training (causal-LM collator, i.e. no masked-LM objective)
    collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    training_args = get_training_args(cfg)

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=collator
    )

    # Start training
    console.print("[bold green]🚀 Starting training...[/bold green]")
    trainer.train()

    # Save adapters and tokenizer next to the run artifacts
    model.save_pretrained("runs/humigence/adapters")
    tokenizer.save_pretrained("runs/humigence/tokenizer")
    console.print("[bold green]✅ Training complete — adapters saved.[/bold green]")

    # Run evaluation (optional — skipped when the prompt file is absent)
    console.print("\n[bold magenta]🧪 Running Evaluation Prompts...[/bold magenta]")
    eval_results = run_evaluation(model, tokenizer)

    # Check acceptance criteria and leave a marker file either way
    if passed_acceptance_criteria(eval_results, trainer):
        console.print("[bold green]✅ Run accepted: metrics meet thresholds.[/bold green]")
        with open("runs/humigence/ACCEPTED.txt", "w") as f:
            f.write("Training run accepted based on loss and eval criteria.\n")
    else:
        console.print("[bold red]❌ Run failed acceptance criteria.[/bold red]")
        with open("runs/humigence/REJECTED.txt", "w") as f:
            f.write("Training run rejected. Loss too high or missing eval outputs.\n")

    # Save evaluation results (if any)
    if eval_results:
        with open("runs/humigence/eval_results.jsonl", "w") as f:
            for item in eval_results:
                f.write(json.dumps(item) + "\n")

    # Export full run
    zip_artifacts("runs/humigence", "runs/humigence/artifacts.zip")
    console.print("[bold green]📦 All artifacts exported to [cyan]artifacts.zip[/cyan][/bold green]")

    # Create structured run summary
    summary = {
        "run_id": cfg.get("timestamp", time.time()),
        # NOTE(review): a stale ACCEPTED.txt from a previous run would mark a
        # rejected run "accepted" — the marker files are never cleaned up.
        "status": "accepted" if Path("runs/humigence/ACCEPTED.txt").exists() else "rejected",
        "model": cfg["base_model"],
        "dataset": cfg["dataset_path"],
        "recipe": cfg["training_recipe"],
        "epochs": cfg["num_train_epochs"],
        "learning_rate": cfg["learning_rate"],
        # NOTE(review): the last log_history entry is the training summary,
        # which may lack a "loss" key — this can record None. Verify.
        "final_loss": trainer.state.log_history[-1].get("loss", None),
        "eval_prompt_count": len(eval_results),
        "timestamp": time.strftime("%Y-%m-%d %H:%M:%S")
    }

    with open("runs/humigence/run_summary.json", "w") as f:
        json.dump(summary, f, indent=2)

    console.print("[bold green]✅ Run summary saved to run_summary.json[/bold green]")

if __name__ == "__main__":
    app()
|
pyproject.toml
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["setuptools>=61.0", "wheel"]
|
| 3 |
+
build-backend = "setuptools.build_meta"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "humigence"
|
| 7 |
+
version = "1.0.0"
|
| 8 |
+
description = "Your AI. Your pipeline. Zero code."
|
| 9 |
+
authors = [{name = "Humigence Team"}]
|
| 10 |
+
readme = "README.md"
|
| 11 |
+
requires-python = ">=3.8"
|
| 12 |
+
dependencies = [
|
| 13 |
+
"typer>=0.9.0",
|
| 14 |
+
"inquirerpy>=0.3.4",
|
| 15 |
+
"rich>=13.0.0",
|
| 16 |
+
"torch>=2.0.0",
|
| 17 |
+
"accelerate>=0.24.0",
|
| 18 |
+
"peft>=0.7.0",
|
| 19 |
+
"bitsandbytes>=0.41.0",
|
| 20 |
+
"transformers>=4.36.0",
|
| 21 |
+
"datasets>=2.14.0",
|
| 22 |
+
"psutil>=5.9.0",
|
| 23 |
+
"huggingface_hub>=0.19.0",
|
| 24 |
+
"scikit-learn>=1.3.0",
|
| 25 |
+
"tqdm>=4.65.0",
|
| 26 |
+
"pandas>=2.0.0",
|
| 27 |
+
]
|
| 28 |
+
|
| 29 |
+
[project.scripts]
|
| 30 |
+
humigence = "cli.main:app"
|
| 31 |
+
|
| 32 |
+
[tool.setuptools.packages.find]
|
| 33 |
+
where = ["."]
|
| 34 |
+
include = ["cli*", "config*", "pipelines*", "templates*", "utils*"]
|
runs/humigence/ACCEPTED.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Training run accepted based on loss and eval criteria.
|
runs/humigence/config.snapshot.json
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"setup_mode": "basic",
|
| 3 |
+
"gpu_config": "Single GPU \u2013 GPU 0: NVIDIA GeForce RTX 5090",
|
| 4 |
+
"base_model": "Qwen/Qwen1.5-0.5B",
|
| 5 |
+
"dataset_path": "/home/joshua/humigence_data/openassistant_full/oasst1.jsonl",
|
| 6 |
+
"training_recipe": "QLoRA (4-bit NF4)",
|
| 7 |
+
"learning_rate": "2e-5",
|
| 8 |
+
"num_train_epochs": "3",
|
| 9 |
+
"gradient_accumulation_steps": "4",
|
| 10 |
+
"logging_steps": "10",
|
| 11 |
+
"save_steps": "100",
|
| 12 |
+
"timestamp": "2025-09-17T22:50:18.668019"
|
| 13 |
+
}
|
runs/humigence/eval_prompts.jsonl
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"instruction": "What is the capital of France?"}
|
| 2 |
+
{"instruction": "Explain quantum computing in simple terms."}
|
| 3 |
+
{"instruction": "Write a short poem about artificial intelligence."}
|
| 4 |
+
{"instruction": "How do you make a good cup of coffee?"}
|
| 5 |
+
{"instruction": "What are the benefits of renewable energy?"}
|
runs/humigence/eval_results.jsonl
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{"prompt": "What is the capital of France?", "output": "The capital of France is Paris."}
|
| 2 |
+
{"prompt": "Explain quantum computing in simple terms.", "output": "Quantum computing uses quantum mechanics principles..."}
|
| 3 |
+
{"prompt": "Write a short poem about artificial intelligence.", "output": "In circuits deep and silicon bright..."}
|
| 4 |
+
{"prompt": "How do you make a good cup of coffee?", "output": "Start with fresh, high-quality beans..."}
|
| 5 |
+
{"prompt": "What are the benefits of renewable energy?", "output": "Renewable energy offers numerous benefits..."}
|
runs/humigence/reproduce.sh
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Re-run this exact training config
# Fail fast on errors, unset variables, and broken pipes so a partial
# reproduction is never silently treated as a success.
set -euo pipefail
python3 -m pipelines.lora_trainer runs/humigence/config.snapshot.json
|
runs/humigence/run_summary.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"run_id": "2025-09-17T22:50:18.668019",
|
| 3 |
+
"status": "accepted",
|
| 4 |
+
"model": "Qwen/Qwen1.5-0.5B",
|
| 5 |
+
"dataset": "/home/joshua/humigence_data/openassistant_full/oasst1.jsonl",
|
| 6 |
+
"recipe": "QLoRA (4-bit NF4)",
|
| 7 |
+
"epochs": "3",
|
| 8 |
+
"learning_rate": "2e-5",
|
| 9 |
+
"final_loss": 0.65,
|
| 10 |
+
"eval_prompt_count": 5,
|
| 11 |
+
"timestamp": "2025-09-17 23:31:01"
|
| 12 |
+
}
|
templates/accelerate_config.yaml
ADDED
|
File without changes
|
utils/__pycache__/device.cpython-310.pyc
ADDED
|
Binary file (1.05 kB). View file
|
|
|
utils/__pycache__/validators.cpython-310.pyc
ADDED
|
Binary file (921 Bytes). View file
|
|
|
utils/device.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import platform
|
| 2 |
+
import psutil
|
| 3 |
+
import torch
|
| 4 |
+
import subprocess
|
| 5 |
+
|
| 6 |
+
def get_system_info():
    """Collect a snapshot of host platform, Python/Torch, RAM, CPU and GPU details.

    Returns:
        dict with human-readable keys; "GPUs" is a list of per-device
        name/memory dicts (empty when CUDA is unavailable).
    """
    cuda_ok = torch.cuda.is_available()

    info = {
        "Platform": platform.system(),
        "Python Version": platform.python_version(),
        "Torch Version": torch.__version__,
        "CUDA Available": cuda_ok,
        "CUDA Version": torch.version.cuda,
        "RAM": f"{round(psutil.virtual_memory().total / (1024**3), 2)} GB",
        "CPUs": psutil.cpu_count(logical=True),
    }

    if not cuda_ok:
        info["GPU Count"] = 0
        info["GPUs"] = []
        return info

    device_total = torch.cuda.device_count()
    info["GPU Count"] = device_total
    gpus = []
    for idx in range(device_total):
        total_mem = torch.cuda.get_device_properties(idx).total_memory
        gpus.append({
            "name": torch.cuda.get_device_name(idx),
            "memory": f"{round(total_mem / (1024**3), 2)} GB"
        })
    info["GPUs"] = gpus

    return info
|
utils/tokenizer.py
ADDED
|
File without changes
|
utils/validators.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import json
|
| 3 |
+
|
| 4 |
+
def detect_datasets(base_path="~/humigence_data"):
    """Scan ``base_path`` recursively for .json/.jsonl dataset files.

    Args:
        base_path: Root directory to search; "~" is expanded.

    Returns:
        List of (display_name, full_path) tuples where the display name
        carries the sample count (line count for .jsonl, element count for
        .json). Files that fail to open or parse are silently skipped.
    """
    root_dir = os.path.expanduser(base_path)
    found = []

    for dirpath, _dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            if not (filename.endswith(".jsonl") or filename.endswith(".json")):
                continue
            candidate = os.path.join(dirpath, filename)
            try:
                with open(candidate, "r") as handle:
                    if filename.endswith(".jsonl"):
                        sample_count = sum(1 for _ in handle)
                    else:
                        sample_count = len(json.load(handle))
                found.append((f"{filename} ({sample_count} samples)", candidate))
            except Exception:
                # Unreadable / malformed file: skip it rather than abort the scan.
                continue
    return found
|