Tags: Text Generation, Transformers, English, vortex, science, physics, chemistry, biology, mathematics, ssm, mamba, hybrid-architecture, custom-tokenizer, from-scratch, matrix-corp
Instructions for using Matrix-Corp/Vortex-13b-V1 with libraries, inference providers, notebooks, and local apps. Follow the links below to get started.
- Libraries
- Transformers
How to use Matrix-Corp/Vortex-13b-V1 with Transformers:
```python
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline("text-generation", model="Matrix-Corp/Vortex-13b-V1")
```

```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("Matrix-Corp/Vortex-13b-V1", dtype="auto")
```
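For end-to-end generation you typically want a tokenizer and a causal-LM head rather than the bare `AutoModel`, which returns hidden states instead of logits. A minimal sketch, assuming the checkpoint ships a standard tokenizer and causal-LM class (from-scratch hybrid architectures often additionally require `trust_remote_code=True`):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Matrix-Corp/Vortex-13b-V1")
# device_map="auto" needs the accelerate package; trust_remote_code is an
# assumption, as custom architectures usually ship their own modeling code.
model = AutoModelForCausalLM.from_pretrained(
    "Matrix-Corp/Vortex-13b-V1",
    dtype="auto",
    device_map="auto",
    trust_remote_code=True,
)

inputs = tokenizer("Once upon a time,", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64, do_sample=True, temperature=0.5)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```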
- Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use Matrix-Corp/Vortex-13b-V1 with vLLM:
Install from pip and serve the model:
```bash
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "Matrix-Corp/Vortex-13b-V1"

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:8000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Matrix-Corp/Vortex-13b-V1",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```
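The same request from Python, using the standard `openai` client pointed at the local server (a sketch; the API key is a placeholder, since vLLM only checks it when the server is started with `--api-key`):

```python
from openai import OpenAI

# Point the OpenAI-compatible client at the local vLLM server.
client = OpenAI(base_url="http://localhost:8000/v1", api_key="EMPTY")

completion = client.completions.create(
    model="Matrix-Corp/Vortex-13b-V1",
    prompt="Once upon a time,",
    max_tokens=512,
    temperature=0.5,
)
print(completion.choices[0].text)
```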
Use Docker

```bash
docker model run hf.co/Matrix-Corp/Vortex-13b-V1
```
- SGLang
How to use Matrix-Corp/Vortex-13b-V1 with SGLang:
Install from pip and serve the model:
```bash
# Install SGLang from pip:
pip install sglang

# Start the SGLang server:
python3 -m sglang.launch_server \
  --model-path "Matrix-Corp/Vortex-13b-V1" \
  --host 0.0.0.0 \
  --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Matrix-Corp/Vortex-13b-V1",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```

Since SGLang exposes the same OpenAI-compatible API, the Python client sketch in the vLLM section also works here; just change `base_url` to `http://localhost:30000/v1`.
Use Docker images

```bash
docker run --gpus all \
  --shm-size 32g \
  -p 30000:30000 \
  -v ~/.cache/huggingface:/root/.cache/huggingface \
  --env "HF_TOKEN=<secret>" \
  --ipc=host \
  lmsysorg/sglang:latest \
  python3 -m sglang.launch_server \
    --model-path "Matrix-Corp/Vortex-13b-V1" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API):
curl -X POST "http://localhost:30000/v1/completions" \
  -H "Content-Type: application/json" \
  --data '{
    "model": "Matrix-Corp/Vortex-13b-V1",
    "prompt": "Once upon a time,",
    "max_tokens": 512,
    "temperature": 0.5
  }'
```

- Docker Model Runner
How to use Matrix-Corp/Vortex-13b-V1 with Docker Model Runner:
```bash
docker model run hf.co/Matrix-Corp/Vortex-13b-V1
```
| """ | |
| CUDA optimizations for Vortex model on Nvidia 4060 laptop. | |
| Flash Attention 2, torch.compile, INT8 quantization. | |
| """ | |
| import torch | |
| import torch.nn as nn | |
| from typing import Optional, Dict, Any | |
def optimize_for_cuda(
    model: nn.Module,
    config: Dict,
    use_flash_attention: bool = True,
    use_torch_compile: bool = True,
    compile_mode: str = "reduce-overhead",
    quantization: Optional[str] = None,
) -> nn.Module:
    """
    Apply CUDA optimizations to the model.

    Args:
        model: VortexModel
        config: Model config
        use_flash_attention: Enable Flash Attention 2
        use_torch_compile: Use torch.compile
        compile_mode: Compile mode ("reduce-overhead", "max-autotune")
        quantization: None, "int8", or "int4"

    Returns:
        Optimized model
    """
    device = torch.device("cuda")

    # Move to CUDA with the dtype requested by the config (default: bfloat16).
    dtype_str = config.get("dtype", "bfloat16")
    if dtype_str == "bfloat16":
        dtype = torch.bfloat16
    elif dtype_str == "float16":
        dtype = torch.float16
    else:
        dtype = torch.float32
    model = model.to(device=device, dtype=dtype)

    # Apply Flash Attention 2 to attention layers.
    if use_flash_attention:
        model = _apply_flash_attention(model)
        print("Applied Flash Attention 2")

    # Quantize before compiling, so torch.compile traces the final modules.
    if quantization == "int8":
        model = _apply_int8_quantization(model)
        print("Applied INT8 quantization")
    elif quantization == "int4":
        model = _apply_int4_quantization(model)
        print("Applied INT4 quantization")

    # torch.compile goes last, once the module structure is final.
    if use_torch_compile:
        model = torch.compile(
            model,
            mode=compile_mode,
            fullgraph=True,
            dynamic=True,
        )
        print(f"Applied torch.compile with mode={compile_mode}")

    return model

def _apply_flash_attention(model: nn.Module) -> nn.Module:
    """
    Route attention layers through Flash Attention 2.

    Requires: pip install flash-attn
    """
    try:
        # Imported only to verify that flash-attn is installed.
        from flash_attn import flash_attn_func  # noqa: F401

        # Flag every attention layer that supports flash attention and,
        # where the module provides one, bind its flash forward pass.
        for module in model.modules():
            if hasattr(module, "use_flash_attention"):
                module.use_flash_attention = True
                if hasattr(module, "_flash_attention_forward"):
                    module.forward = module._flash_attention_forward
        return model
    except ImportError:
        print("Flash Attention not available. Install with: pip install flash-attn")
        return model

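# Why FlashAttention helps: it computes exact attention without materializing
# the (seq_len x seq_len) score matrix, so activation memory grows linearly
# rather than quadratically with sequence length.
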
def _apply_int8_quantization(model: nn.Module) -> nn.Module:
    """
    Apply INT8 quantization to all linear layers using bitsandbytes.
    """
    try:
        import bitsandbytes as bnb

        # Snapshot the module list first: we mutate the tree while iterating.
        for name, module in list(model.named_modules()):
            if isinstance(module, nn.Linear):
                parent_name = name.rsplit('.', 1)[0] if '.' in name else ''
                child_name = name.rsplit('.', 1)[1] if '.' in name else name

                # Walk down to the parent module.
                parent = model
                if parent_name:
                    for part in parent_name.split('.'):
                        parent = getattr(parent, part)

                # Replace with an 8-bit linear layer.
                replacement = bnb.nn.Linear8bitLt(
                    module.in_features,
                    module.out_features,
                    bias=module.bias is not None,
                    has_fp16_weights=False,
                )
                # Copy weights; bitsandbytes quantizes them when the layer
                # is moved to the GPU.
                replacement.weight.data = module.weight.data
                if module.bias is not None:
                    replacement.bias.data = module.bias.data
                setattr(parent, child_name, replacement.to(module.weight.device))
        return model
    except ImportError:
        print("bitsandbytes not available. Install with: pip install bitsandbytes")
        return model

def _apply_int4_quantization(model: nn.Module) -> nn.Module:
    """
    Apply INT4 quantization using bitsandbytes.
    More aggressive; the realistic option for 13B on 8 GB of VRAM.
    """
    try:
        import bitsandbytes as bnb

        for name, module in list(model.named_modules()):
            if isinstance(module, nn.Linear):
                parent_name = name.rsplit('.', 1)[0] if '.' in name else ''
                child_name = name.rsplit('.', 1)[1] if '.' in name else name

                parent = model
                if parent_name:
                    for part in parent_name.split('.'):
                        parent = getattr(parent, part)

                # Replace with a 4-bit linear layer.
                replacement = bnb.nn.Linear4bit(
                    module.in_features,
                    module.out_features,
                    bias=module.bias is not None,
                    compute_dtype=torch.float16,
                    compress_statistics=True,
                )
                # Weights are quantized when the layer is moved to the GPU.
                replacement.weight.data = module.weight.data
                if module.bias is not None:
                    replacement.bias.data = module.bias.data
                setattr(parent, child_name, replacement.to(module.weight.device))
        return model
    except ImportError:
        print("bitsandbytes not available. Install with: pip install bitsandbytes")
        return model

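# Rough VRAM arithmetic behind "13B on 8 GB" (an estimate, not a measured
# figure): 13e9 params * 0.5 bytes/param at 4-bit ~= 6.5 GB of weights,
# leaving roughly 1.5 GB of an 8 GB card for activations, recurrent/KV state,
# and the CUDA context. At INT8 (~13 GB) or fp16 (~26 GB) the weights alone
# no longer fit.
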
def get_cuda_memory_usage() -> Dict[str, float]:
    """Get current CUDA memory usage in GB."""
    if not torch.cuda.is_available():
        return {"error": "CUDA not available"}
    allocated = torch.cuda.memory_allocated() / 1e9
    reserved = torch.cuda.memory_reserved() / 1e9
    max_allocated = torch.cuda.max_memory_allocated() / 1e9
    return {
        "allocated_gb": allocated,
        "reserved_gb": reserved,
        "max_allocated_gb": max_allocated,
    }

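# Note: "reserved" includes pages held by PyTorch's caching allocator for
# reuse, so it is always >= "allocated".
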
def profile_model(
    model: nn.Module,
    input_ids: torch.Tensor,
    num_warmup: int = 10,
    num_runs: int = 100,
) -> Dict[str, float]:
    """
    Profile model performance.

    Args:
        model: Model to profile
        input_ids: Example input
        num_warmup: Number of warmup runs
        num_runs: Number of profiling runs

    Returns:
        Dictionary with timing statistics
    """
    model.eval()
    device = next(model.parameters()).device
    input_ids = input_ids.to(device)

    # Warmup (also absorbs torch.compile's first-call compilation cost).
    with torch.no_grad():
        for _ in range(num_warmup):
            _ = model(input_ids)

    # Profile.
    torch.cuda.synchronize()
    start = time.perf_counter()
    with torch.no_grad():
        for _ in range(num_runs):
            _ = model(input_ids)
    torch.cuda.synchronize()
    elapsed = time.perf_counter() - start

    avg_time = elapsed / num_runs
    # Count every token in the batch, not just one sequence.
    tokens_per_sec = input_ids.numel() / avg_time
    return {
        "avg_time_sec": avg_time,
        "tokens_per_sec": tokens_per_sec,
    }

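# Note: this measures full-sequence forward passes (prefill-style throughput),
# not autoregressive decoding, where per-token latency is typically much higher.
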
def test_cuda_optimize():
    """Test CUDA optimizations."""
    if not torch.cuda.is_available():
        print("CUDA not available, skipping test")
        return

    from models.vortex_model import VortexModel
    from configs.vortex_7b_config import VORTEX_7B_CONFIG

    # Shrink the config so the smoke test runs quickly.
    config = VORTEX_7B_CONFIG.copy()
    config["d_model"] = 512
    config["num_layers"] = 2
    config["num_heads"] = 8
    config["vocab_size"] = 1000

    model = VortexModel(config)
    print(f"Model parameters: {model.get_num_params():,}")

    # Optimize
    model = optimize_for_cuda(
        model,
        config,
        use_flash_attention=False,  # May not be available
        use_torch_compile=False,    # Skip compile for test
        quantization=None,
    )

    # Test forward
    batch_size = 2
    seq_len = 128
    input_ids = torch.randint(0, config["vocab_size"], (batch_size, seq_len)).cuda()
    with torch.no_grad():
        output = model(input_ids)
    logits = output["logits"]
    print(f"Output shape: {logits.shape}")
    print("CUDA optimize test passed!")


if __name__ == "__main__":
    test_cuda_optimize()
```
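Putting it together, a hypothetical end-to-end run on an 8 GB card could look like the sketch below. It reuses the repo-internal imports from `test_cuda_optimize` above; whether a dedicated 13B config module exists is an assumption, so the 7B config stands in here.

```python
import torch

from models.vortex_model import VortexModel
from configs.vortex_7b_config import VORTEX_7B_CONFIG  # stand-in config

config = VORTEX_7B_CONFIG.copy()
model = VortexModel(config)

# INT4 is the realistic choice for running billions of parameters in 8 GB.
model = optimize_for_cuda(
    model,
    config,
    use_flash_attention=True,
    use_torch_compile=True,
    quantization="int4",
)

input_ids = torch.randint(0, config["vocab_size"], (1, 256)).cuda()
print(profile_model(model, input_ids))
print(get_cuda_memory_usage())
```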