Instructions to use Deci/DeciLM-7B-instruct with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use Deci/DeciLM-7B-instruct with Transformers:
# Use a pipeline as a high-level helper from transformers import pipeline pipe = pipeline("text-generation", model="Deci/DeciLM-7B-instruct", trust_remote_code=True) messages = [ {"role": "user", "content": "Who are you?"}, ] pipe(messages)# Load model directly from transformers import AutoModelForCausalLM model = AutoModelForCausalLM.from_pretrained("Deci/DeciLM-7B-instruct", trust_remote_code=True, dtype="auto") - Notebooks
- Google Colab
- Kaggle
- Local Apps
- vLLM
How to use Deci/DeciLM-7B-instruct with vLLM:
Install from pip and serve model
# Install vLLM from pip: pip install vllm # Start the vLLM server: vllm serve "Deci/DeciLM-7B-instruct" # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:8000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "Deci/DeciLM-7B-instruct", "messages": [ { "role": "user", "content": "What is the capital of France?" } ] }'Use Docker
docker model run hf.co/Deci/DeciLM-7B-instruct
- SGLang
How to use Deci/DeciLM-7B-instruct with SGLang:
Install from pip and serve model
# Install SGLang from pip: pip install sglang # Start the SGLang server: python3 -m sglang.launch_server \ --model-path "Deci/DeciLM-7B-instruct" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "Deci/DeciLM-7B-instruct", "messages": [ { "role": "user", "content": "What is the capital of France?" } ] }'Use Docker images
docker run --gpus all \ --shm-size 32g \ -p 30000:30000 \ -v ~/.cache/huggingface:/root/.cache/huggingface \ --env "HF_TOKEN=<secret>" \ --ipc=host \ lmsysorg/sglang:latest \ python3 -m sglang.launch_server \ --model-path "Deci/DeciLM-7B-instruct" \ --host 0.0.0.0 \ --port 30000 # Call the server using curl (OpenAI-compatible API): curl -X POST "http://localhost:30000/v1/chat/completions" \ -H "Content-Type: application/json" \ --data '{ "model": "Deci/DeciLM-7B-instruct", "messages": [ { "role": "user", "content": "What is the capital of France?" } ] }' - Docker Model Runner
How to use Deci/DeciLM-7B-instruct with Docker Model Runner:
docker model run hf.co/Deci/DeciLM-7B-instruct
| import json | |
| from argparse import ArgumentParser | |
| import datasets | |
| import torch | |
| import transformers | |
| from transformers import AutoModelForCausalLM, BatchEncoding | |
| """ | |
| Usage examples (with the best batch sizes on A100-80GB-400W) | |
| ============================================================ | |
| python -m benchmark_hf_model --model_name_or_path="Deci/DeciLM-7B" --batch_size=352 | |
| python -m benchmark_hf_model --model_name_or_path="mistralai/Mistral-7B-v0.1" --batch_size=192 --model_kwargs_json='{"use_flash_attention_2": true}' | |
| python -m benchmark_hf_model --model_name_or_path="meta-llama/Llama-2-7b-hf" --batch_size=48 --model_kwargs_json='{"use_flash_attention_2": true}' | |
| """ | |
| def parse_args(): | |
| parser = ArgumentParser() | |
| parser.add_argument( | |
| "--model_name_or_path", | |
| type=str, | |
| required=True, | |
| ) | |
| parser.add_argument( | |
| "--warmup_iters", | |
| type=int, | |
| default=10, | |
| ) | |
| parser.add_argument( | |
| "--iterations", | |
| type=int, | |
| default=5, | |
| ) | |
| parser.add_argument( | |
| "--batch_size", | |
| type=int, | |
| default=32, | |
| ) | |
| parser.add_argument( | |
| "--prompt_length", | |
| type=int, | |
| default=512, | |
| ) | |
| parser.add_argument( | |
| "--max_new_tokens", | |
| type=int, | |
| default=512, | |
| ) | |
| parser.add_argument( | |
| "--precision", | |
| type=str, | |
| default="bf16", | |
| help="Model precision, from: fp32, fp16 or bf16", | |
| ) | |
| parser.add_argument( | |
| "--model_kwargs_json", | |
| type=str, | |
| default=None, | |
| ) | |
| return parser.parse_args() | |
| def main(): | |
| args = parse_args() | |
| transformers.logging.set_verbosity_error() | |
| datasets.logging.set_verbosity_error() | |
| dict_precisions = { | |
| "fp32": torch.float32, | |
| "fp16": torch.float16, | |
| "bf16": torch.bfloat16, | |
| } | |
| if args.precision not in dict_precisions: | |
| raise ValueError( | |
| f"Non valid precision {args.precision}, choose from: fp16, fp32, bf16" | |
| ) | |
| dtype = dict_precisions[args.precision] | |
| model_kwargs = {} | |
| if args.model_kwargs_json is not None: | |
| model_kwargs = json.loads(args.model_kwargs_json) | |
| print(f"loading model...") | |
| model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, trust_remote_code=True, | |
| torch_dtype=dtype, **model_kwargs) | |
| try: | |
| print(model.model.layers[0].self_attn) | |
| except: | |
| print("couldn't print the model's attention module") | |
| starter, ender = torch.cuda.Event(enable_timing=True), torch.cuda.Event(enable_timing=True) | |
| model.cuda() | |
| model.eval() | |
| prompt = torch.ones(args.prompt_length, dtype=torch.long) | |
| inputs = BatchEncoding({"input_ids": prompt.repeat(args.batch_size, 1)}) | |
| inputs = inputs.to(model.device) | |
| # warmup | |
| print(f"warming up for {args.warmup_iters} iterations...") | |
| for _ in range(args.warmup_iters): | |
| with torch.no_grad(): | |
| _ = model.generate( | |
| **inputs, | |
| max_new_tokens=1, | |
| do_sample=False, | |
| eos_token_id=-1234, | |
| ) | |
| print('finished warmup') | |
| torch.cuda.synchronize() | |
| print( | |
| f"prefill ({args.prompt_length} tokens{f' x {args.batch_size} batch' if args.batch_size > 1 else ''}) + generation ({args.max_new_tokens} tokens{f' x {args.batch_size} batch' if args.batch_size > 1 else ''}):") | |
| tokens_generated = args.max_new_tokens * args.batch_size | |
| prefill_and_generation = [] | |
| for gen_iter in range(args.iterations): | |
| starter.record() | |
| with torch.no_grad(): | |
| _ = model.generate( | |
| **inputs, | |
| max_new_tokens=args.max_new_tokens, | |
| do_sample=False, | |
| eos_token_id=-1234, | |
| ) | |
| ender.record() | |
| torch.cuda.synchronize() | |
| t = starter.elapsed_time(ender) / 1000 | |
| prefill_and_generation.append(t) | |
| print(f" iter {gen_iter + 1}: {t:.03f} sec total, {tokens_generated / t:.02f} generated tokens/sec") | |
| aver = sum(prefill_and_generation) / len(prefill_and_generation) | |
| print(f" average: {aver:.03f} sec total, {tokens_generated / aver:.02f} generated tokens/sec") | |
| print(f"These results are obtained for model '{args.model_name_or_path}' with {args.batch_size=}.") | |
| if __name__ == "__main__": | |
| main() | |