import random

import requests
from flask import Flask, request, Response, stream_with_context, render_template_string

app = Flask(__name__)

@app.route('/')
def index():
    template = '''
<html>
<head>
<title>Huggingface Chat API Adapter</title>
</head>
<body>
<h1>Huggingface Chat API Adapter</h1>
[Introduction]<br>
When using Hugging Face's Serverless Inference API for chat, only 100 new tokens are generated by default and responses are served from a cache.<br>
This adapter changes these two defaults; all other parameters behave exactly as in the official API.<br>
<br>
[How to use]<br>
1. <a target="_blank" href="https://huggingface.co/settings/tokens/new">Create a token</a> with the "Make calls to the serverless Inference API" permission and use it as your API key.<br>
2. Set the Base URL of your OpenAI-compatible client to "https://tastypear-sia-chat-adapter.hf.space/api".<br>
3. Use the model's full name (e.g. mistralai/Mistral-Nemo-Instruct-2407).<br>
<br>
[Supported models]<br>
Most of the available models can be found <a target="_blank" href="https://huggingface.co/models?inference=warm&other=text-generation-inference">HERE</a>.<br>
Some "cold" models may also work (e.g. meta-llama/Meta-Llama-3.1-405B-Instruct); please test them yourself.<br>
Some models require a token created by a PRO user.<br>
<br>
[Avoid reaching the call limit]<br>
If you have multiple tokens, you can join them with semicolons (";") and the API will use a random one for each request (e.g. "hf_aaaa;hf_bbbb;hf_...")<br>
</body>
</html>
'''
    return render_template_string(template)
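
# Usage sketch: how an OpenAI-compatible client could call this adapter, following the
# instructions rendered above. Kept as a comment so this module stays import-safe; the
# base URL, model name, and tokens are illustrative placeholders, and the `openai` v1 SDK
# is an assumption, not something this Space requires.
#
#   from openai import OpenAI
#
#   client = OpenAI(
#       base_url="https://tastypear-sia-chat-adapter.hf.space/api",
#       api_key="hf_aaaa;hf_bbbb",  # several tokens joined by ';' are rotated per request
#   )
#   stream = client.chat.completions.create(
#       model="mistralai/Mistral-Nemo-Instruct-2407",
#       messages=[{"role": "user", "content": "Hello!"}],
#       stream=True,
#   )
#   for chunk in stream:
#       print(chunk.choices[0].delta.content or "", end="")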

# Route paths are an assumption: OpenAI-compatible clients typically append
# "/chat/completions" (or "/v1/chat/completions") to the Base URL shown above,
# so both paths are registered here.
@app.route('/api/chat/completions', methods=['POST'])
@app.route('/api/v1/chat/completions', methods=['POST'])
def proxy():
    headers = dict(request.headers)
    headers.pop('Host', None)
    headers.pop('Content-Length', None)
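    # Token rotation: the client may supply several hf_... tokens joined by ';'
    # (see "Avoid reaching the call limit" above); pick one at random per request.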
    keys = [k for k in request.headers['Authorization'].split(' ')[1].split(';') if k]
    headers['Authorization'] = f'Bearer {random.choice(keys)}'
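    # Disable the Inference API's response cache so repeated prompts are generated
    # fresh (one of the two defaults this adapter changes).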
    headers['X-Use-Cache'] = 'false'
    json_data = request.get_json()
    model = json_data['model']
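    # Serverless Inference API: OpenAI-compatible chat endpoint for the requested model.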
| chat_api = f"https://api-inference.huggingface.co/models/{model}/v1/chat/completions" | |
    # Gemma does not support a system prompt;
    # prepend it to the first user message instead.
    if model.startswith('google/gemma') and json_data["messages"][0]['role'] == 'system':
        system_prompt = json_data["messages"][0]['content']
        first_user_content = json_data["messages"][1]['content']
        json_data["messages"][1]['content'] = f'System: {system_prompt}\n\n---\n\n{first_user_content}'
        json_data["messages"] = json_data["messages"][1:]
    # Try to use the largest ctx
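    # If the client did not set max_tokens, probe the backend with an oversized value;
    # the resulting validation error reports the model's context limit and the prompt's
    # token count (a message of the rough shape "... must be <= <limit>. Given: <n> `inputs` ...",
    # inferred from the parsing below), which lets us fill the remaining context.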
    if 'max_tokens' not in json_data:
        json_data['max_tokens'] = 2**32 - 1
        json_data['json_mode'] = True
        info = requests.post(chat_api, json=json_data, headers=headers, stream=False).text
        json_data['json_mode'] = False
        try:
            max_ctx = int(info.split("<= ")[1].split(".")[0])
            inputs = int(info.split("Given: ")[1].split("`")[0])
            json_data['max_tokens'] = max_ctx - inputs - 1
        except Exception:
            print(info)
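    # If the client did not set a seed, pick a random one (presumably so identical
    # requests do not yield identical completions).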
    if 'seed' not in json_data:
        json_data['seed'] = random.randint(1, 2**32)
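    # Relay the upstream response to the client as-is, chunk by chunk (SSE pass-through).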
    def generate():
        with requests.post(chat_api, json=json_data, headers=headers, stream=True) as resp:
            for chunk in resp.iter_content(chunk_size=1024):
                if chunk:
                    yield chunk
    return Response(stream_with_context(generate()), content_type='text/event-stream')

#import gevent.pywsgi
#from gevent import monkey; monkey.patch_all()

if __name__ == "__main__":
    app.run(debug=True)
    # gevent.pywsgi.WSGIServer((args.host, args.port), app).serve_forever()