| | import torch |
| | from typing import Dict, List, Any |
| | from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline |
| |
|
| | |
# Precision used when loading the model weights.
# - CUDA GPU with compute capability major version >= 8: bfloat16.
# - Any other CUDA GPU: float16.
# - No CUDA device: float32. The capability query is skipped entirely in this
#   case because torch.cuda.get_device_capability() raises without a usable
#   CUDA device, which would otherwise make the module fail at import time.
if torch.cuda.is_available():
    dtype = (
        torch.bfloat16
        if torch.cuda.get_device_capability()[0] >= 8
        else torch.float16
    )
else:
    dtype = torch.float32
| |
|
| |
|
class EndpointHandler:
    """Callable request handler wrapping a ``text-generation`` pipeline.

    The handler is built once from a model directory and then invoked with
    one request payload per call.
    """

    def __init__(self, path=""):
        """Load the tokenizer and causal LM from *path* and build the pipeline.

        Args:
            path: Location handed to ``from_pretrained`` for both the
                tokenizer and the model.
        """
        tokenizer = AutoTokenizer.from_pretrained(path)
        # device_map="auto" delegates device placement to the library; the
        # weight precision comes from the module-level ``dtype``.
        model = AutoModelForCausalLM.from_pretrained(
            path, device_map="auto", torch_dtype=dtype
        )
        self.pipeline = pipeline(
            "text-generation", model=model, tokenizer=tokenizer
        )

    def __call__(self, data: Dict[str, Any]) -> List[Any]:
        """Run the pipeline on one request payload.

        Args:
            data: Request mapping. ``data["inputs"]`` is passed to the
                pipeline as its input; when that key is absent, the payload
                itself (minus ``"parameters"``) is passed instead. The
                optional ``data["parameters"]`` mapping is expanded into
                keyword arguments for the pipeline call.

        Returns:
            Whatever the pipeline returns, unmodified. For text generation
            this is normally a list of result dicts (or a list of such lists
            for batched input) -- see the Transformers pipeline docs.
        """
        # Work on a shallow copy so the caller's request dict is not mutated
        # by the pops below.
        payload = dict(data)
        inputs = payload.pop("inputs", payload)
        parameters = payload.pop("parameters", None)

        if parameters is None:
            return self.pipeline(inputs)
        return self.pipeline(inputs, **parameters)
| |
|