import random
import requests
from flask import Flask, request, Response, stream_with_context, render_template_string

app = Flask(__name__)
# Landing page with usage notes (served at the root path).
@app.route('/')
def index():
    template = '''
    <html>
    <head>
        <title>Huggingface Chat API Adapter</title>
    </head>
    <body>
        <h1>Huggingface Chat API Adapter</h1>
        [Introduction]<br>
        By default, Huggingface's Serverless Inference API generates only 100 new tokens per chat request and serves responses from a cache.<br>
        This adapter changes these two defaults; all other parameters behave the same as the official API.<br>
        <br>
        [How to use]<br>
        1. <a target="_blank" href="https://huggingface.co/settings/tokens/new">Create a token</a> with the "Make calls to the serverless Inference API" permission and use it as the API key.<br>
        2. Set the Base URL of your OpenAI-compatible client to "https://tastypear-sia-chat-adapter.hf.space/api".<br>
        3. Use the full model name (e.g. mistralai/Mistral-Nemo-Instruct-2407).<br>
        <br>
        [Supported models]<br>
        Most of the available models are listed <a target="_blank" href="https://huggingface.co/models?inference=warm&other=text-generation-inference">HERE</a>.<br>
        Some "cold" models may also work (e.g. meta-llama/Meta-Llama-3.1-405B-Instruct); please test them yourself.<br>
        Some models require a token created by a PRO user.<br>
        <br>
        [Avoid reaching the call limit]<br>
        If you have multiple tokens, you can join them with semicolons (";") and the API will pick one at random (e.g. "hf_aaaa;hf_bbbb;hf_...").<br>
    </body>
    </html>
    '''
    return render_template_string(template)
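# Illustrative client-side sketch for the [How to use] steps above (not part of
# the server). It assumes the `openai` Python package; any OpenAI-compatible
# client works the same way. Model name and tokens are placeholders.
#
#   from openai import OpenAI
#
#   client = OpenAI(
#       base_url="https://tastypear-sia-chat-adapter.hf.space/api",
#       api_key="hf_aaaa;hf_bbbb",  # one token, or several joined by ";" for random rotation
#   )
#   completion = client.chat.completions.create(
#       model="mistralai/Mistral-Nemo-Instruct-2407",  # full model name
#       messages=[{"role": "user", "content": "Hello!"}],
#   )
#   print(completion.choices[0].message.content)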
# OpenAI-compatible chat completions endpoint.
# Route paths are assumed from the "/api" base URL shown above; an OpenAI
# client appends "/chat/completions" (or "/v1/chat/completions") to it.
@app.route('/api/chat/completions', methods=['POST'])
@app.route('/api/v1/chat/completions', methods=['POST'])
def proxy():
    # Forward the client's headers, minus the ones that must be recalculated.
    headers = dict(request.headers)
    headers.pop('Host', None)
    headers.pop('Content-Length', None)
    # The client may send several keys joined by ";" (e.g. "hf_aaa;hf_bbb").
    # Split them and pick one at random, dropping empty fragments from split().
    keys = [k for k in request.headers['Authorization'].split(' ')[1].replace(';', '').split('hf_') if k]
    headers['Authorization'] = f'Bearer hf_{random.choice(keys)}'
    # Disable Huggingface's response cache so every call is re-generated.
    headers['X-Use-Cache'] = 'false'
    json_data = request.get_json()
    model = json_data['model']
    chat_api = f"https://api-inference.huggingface.co/models/{model}/v1/chat/completions"
    # Gemma models do not accept a system role: fold the system prompt
    # into the first user message instead.
    if model.startswith('google/gemma') and json_data["messages"][0]['role'] == 'system':
        system_prompt = json_data["messages"][0]['content']
        first_user_content = json_data["messages"][1]['content']
        json_data["messages"][1]['content'] = f'System: {system_prompt}\n\n---\n\n{first_user_content}'
        json_data["messages"] = json_data["messages"][1:]
    # Try to use the largest possible context. If the client did not set
    # max_tokens, send a deliberately oversized, non-streaming probe request;
    # the API rejects it with an error such as
    # "`inputs` tokens + `max_new_tokens` must be <= 32768. Given: 123 `inputs` tokens ..."
    # from which the model's context size and the prompt length are parsed.
    if 'max_tokens' not in json_data:
        json_data['max_tokens'] = 2**32 - 1
        json_data['json_mode'] = True  # toggled only for the probe request
        info = requests.post(chat_api, json=json_data, headers=headers, stream=False).text
        json_data['json_mode'] = False
        try:
            max_ctx = int(info.split("<= ")[1].split(".")[0])
            inputs = int(info.split("Given: ")[1].split("`")[0])
            json_data['max_tokens'] = max_ctx - inputs - 1
        except Exception:
            print(info)

    # Use a random seed unless the client provided one.
    if 'seed' not in json_data:
        json_data['seed'] = random.randint(1, 2**32)
    # Stream the upstream SSE response back to the client unchanged.
    def generate():
        with requests.post(chat_api, json=json_data, headers=headers, stream=True) as resp:
            for chunk in resp.iter_content(chunk_size=1024):
                if chunk:
                    yield chunk

    return Response(stream_with_context(generate()), content_type='text/event-stream')
# import gevent.pywsgi
# from gevent import monkey; monkey.patch_all()

if __name__ == "__main__":
    app.run(debug=True)
    # gevent.pywsgi.WSGIServer((args.host, args.port), app).serve_forever()
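# Streaming works the same way; a minimal sketch assuming the same `openai`
# client as in the example above (stream=True makes the client consume the
# text/event-stream response this adapter returns):
#
#   stream = client.chat.completions.create(
#       model="mistralai/Mistral-Nemo-Instruct-2407",
#       messages=[{"role": "user", "content": "Write a haiku about caching."}],
#       stream=True,
#   )
#   for chunk in stream:
#       delta = chunk.choices[0].delta.content
#       if delta:
#           print(delta, end="", flush=True)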