Spaces:

tastypear
/

sia-chat-adapter

Running

App Files Files Community

sia-chat-adapter / main.py

tastypear

Support multiple keys, no need to use ";" combination

0e2e2c3 verified 11 months ago

raw

history blame contribute delete

3.82 kB

	import random
	import requests
	from flask import Flask, request, Response, stream_with_context, render_template_string

	app = Flask(__name__)

	@app.route('/', methods=['GET'])
	def index():
	template = '''
	<html>
	<head>
	<title>Huggingface Chat API Adapter</title>
	</head>
	<body>
	<h1>Huggingface Chat API Adapter</h1>

	[Introduction]<br>
	When using Huggingface's Serverless Inference API for a conversation, by default 100 new tokens are output and a cache is used.<br>
	This API changes these two default settings, and other parameters are consistent with the official API.<br>
	<br>
	[How to use]<br>
	1. <a target="_blank" href="https://huggingface.co/settings/tokens/new">Create a token</a> with the "Make calls to the serverless Inference API" permission as an API key.<br>
	2. Set the Base URL of the OpenAI compatible client to "https://tastypear-sia-chat-adapter.hf.space/api".<br>
	3. Use the full name of the model (e.g. mistralai/Mistral-Nemo-Instruct-2407)<br>
	<br>
	[Supported models]<br>
	Most of the available models can be found <a target="_blank" href="https://huggingface.co/models?inference=warm&other=text-generation-inference">HERE</a>.<br>
	Some "cold" models may also be supported (e.g. meta-llama/Meta-Llama-3.1-405B-Instruct), please test it yourself.<br>
	Some models require a token created by a PRO user to use.<br>
	<br>
	[Avoid reaching the call limit]<br>
	If you have multiple tokens, you can connect them with a semicolon (";") and the API will use a random one (e.g. "hf_aaaa;hf_bbbb;hf_...")<br>
	</body>
	</html>
	'''
	return render_template_string(template)

	@app.route('/api/v1/chat/completions', methods=['POST'])
	def proxy():
	headers = dict(request.headers)
	headers.pop('Host', None)
	headers.pop('Content-Length', None)
	keys = request.headers['Authorization'].split(' ')[1].replace(';','').split('hf_')
	headers['Authorization'] = f'Bearer hf_{random.choice(keys)}'
	headers['X-Use-Cache'] = 'false'

	json_data = request.get_json()
	model = json_data['model']
	chat_api = f"https://api-inference.huggingface.co/models/{model}/v1/chat/completions"

	# gemma does not support system prompt
	# add system prompt before user message
	if model.startswith('google/gemma') and json_data["messages"][0]['role']=='system':
	system_prompt = json_data["messages"][0]['content']
	first_user_content = json_data["messages"][1]['content']
	json_data["messages"][1]['content'] = f'System: {system_prompt}\n\n---\n\n{first_user_content}'
	json_data["messages"] = json_data["messages"][1:]

	# Try to use the largest ctx
	if not 'max_tokens' in json_data:
	json_data['max_tokens'] = 2**32-1
	json_data['json_mode'] = True
	info = requests.post(chat_api, json=request.json, headers=headers, stream=False).text
	json_data['json_mode'] = False
	try:
	max_ctx = int(info.split("<= ")[1].split(".")[0])
	inputs = int(info.split("Given: ")[1].split("`")[0])
	json_data['max_tokens'] = max_ctx - inputs - 1
	except Exception as e:
	print(info)

	if not 'seed' in json_data:
	json_data['seed'] = random.randint(1,2**32)

	def generate():
	with requests.post(chat_api, json=request.json, headers=headers, stream=True) as resp:
	for chunk in resp.iter_content(chunk_size=1024):
	if chunk:
	yield chunk

	return Response(stream_with_context(generate()), content_type='text/event-stream')

	#import gevent.pywsgi
	#from gevent import monkey;monkey.patch_all()
	if __name__ == "__main__":
	app.run(debug=True)
	# gevent.pywsgi.WSGIServer((args.host, args.port), app).serve_forever()