tastypear committed
Commit 4fae6ee · verified · 1 Parent(s): fdbe505

Update main.py

Files changed (1)
  1. main.py +79 -76
main.py CHANGED
@@ -1,76 +1,79 @@
- import random
- import requests
- from flask import Flask, request, Response, stream_with_context, render_template_string
-
- app = Flask(__name__)
-
- @app.route('/', methods=['GET'])
- def index():
-     template = '''
-     <html>
-     <head>
-     <title>Huggingface Chat API Adapter</title>
-     </head>
-     <body>
-     <h1>Huggingface Chat API Adapter</h1>
-
-     [Introduction]<br>
-     When using Huggingface's Serverless Inference API for a conversation, by default 100 new tokens are output and a cache is used.<br>
-     This API changes these two default settings, and other parameters are consistent with the official API.<br>
-     <br>
-     [How to use]<br>
-     1. <a target="_blank" href="https://huggingface.co/settings/tokens/new">Create a token</a> with the "Make calls to the serverless Inference API" permission as an API key.<br>
-     2. Set the Base URL of the OpenAI compatible client to "https://tastypear-sia-chat-adapter.hf.space/api".<br>
-     3. Use the full name of the model (e.g. mistralai/Mistral-Nemo-Instruct-2407)<br>
-     <br>
-     [Supported models]<br>
-     Most of the available models can be found <a target="_blank" href="https://huggingface.co/models?inference=warm&other=text-generation-inference">HERE</a>.<br>
-     Some "cold" models may also be supported (e.g. meta-llama/Meta-Llama-3.1-405B-Instruct), please test it yourself.<br>
-     Some models require a token created by a PRO user to use.<br>
-     <br>
-     [Avoid reaching the call limit]<br>
-     If you have multiple tokens, you can connect them with a semicolon (";") and the API will use a random one (e.g. "hf_aaaa;hf_bbbb;hf_...")<br>
-     </body>
-     </html>
-     '''
-     return render_template_string(template)
-
- @app.route('/api/v1/chat/completions', methods=['POST'])
- def proxy():
-     headers = dict(request.headers)
-     headers.pop('Host', None)
-     headers.pop('Content-Length', None)
-     keys = request.headers['Authorization'].split(' ')[1].split(';')
-     headers['Authorization'] = f'Bearer {random.choice(keys)}'
-     headers['X-Use-Cache'] = 'false'
-
-     json_data = request.get_json()
-     model = json_data['model']
-     chat_api = f"https://api-inference.huggingface.co/models/{model}/v1/chat/completions"
-
-     # Try to use the largest ctx
-     if not 'max_tokens' in json_data:
-         json_data['max_tokens'] = 2**32-1
-         json_data['json_mode'] = True
-         info = requests.post(chat_api, json=request.json, headers=headers, stream=False).text
-         max_ctx = int(info.split("<= ")[1].split(".")[0])
-         inputs = int(info.split("Given: ")[1].split("`")[0])
-         json_data['json_mode'] = False
-         json_data['max_tokens'] = max_ctx - inputs - 1
-
-     if not 'seed' in json_data:
-         json_data['seed'] = random.randint(1,2**32)
-
-     def generate():
-         with requests.post(chat_api, json=request.json, headers=headers, stream=True) as resp:
-             for chunk in resp.iter_content(chunk_size=1024):
-                 if chunk:
-                     yield chunk
-
-     return Response(stream_with_context(generate()), content_type='text/event-stream')
-
- #import gevent.pywsgi
- #from gevent import monkey;monkey.patch_all()
- if __name__ == "__main__":
-     app.run(debug=True)
-     # gevent.pywsgi.WSGIServer((args.host, args.port), app).serve_forever()
 
 
 
 
+ import random
+ import requests
+ from flask import Flask, request, Response, stream_with_context, render_template_string
+
+ app = Flask(__name__)
+
+ @app.route('/', methods=['GET'])
+ def index():
+     template = '''
+     <html>
+     <head>
+     <title>Huggingface Chat API Adapter</title>
+     </head>
+     <body>
+     <h1>Huggingface Chat API Adapter</h1>
+
+     [Introduction]<br>
+     When using Huggingface's Serverless Inference API for a conversation, by default 100 new tokens are output and a cache is used.<br>
+     This API changes these two default settings, and other parameters are consistent with the official API.<br>
+     <br>
+     [How to use]<br>
+     1. <a target="_blank" href="https://huggingface.co/settings/tokens/new">Create a token</a> with the "Make calls to the serverless Inference API" permission as an API key.<br>
+     2. Set the Base URL of the OpenAI compatible client to "https://tastypear-sia-chat-adapter.hf.space/api".<br>
+     3. Use the full name of the model (e.g. mistralai/Mistral-Nemo-Instruct-2407)<br>
+     <br>
+     [Supported models]<br>
+     Most of the available models can be found <a target="_blank" href="https://huggingface.co/models?inference=warm&other=text-generation-inference">HERE</a>.<br>
+     Some "cold" models may also be supported (e.g. meta-llama/Meta-Llama-3.1-405B-Instruct), please test it yourself.<br>
+     Some models require a token created by a PRO user to use.<br>
+     <br>
+     [Avoid reaching the call limit]<br>
+     If you have multiple tokens, you can connect them with a semicolon (";") and the API will use a random one (e.g. "hf_aaaa;hf_bbbb;hf_...")<br>
+     </body>
+     </html>
+     '''
+     return render_template_string(template)
+
+ @app.route('/api/v1/chat/completions', methods=['POST'])
+ def proxy():
+     headers = dict(request.headers)
+     headers.pop('Host', None)
+     headers.pop('Content-Length', None)
+     keys = request.headers['Authorization'].split(' ')[1].split(';')
+     headers['Authorization'] = f'Bearer {random.choice(keys)}'
+     headers['X-Use-Cache'] = 'false'
+
+     json_data = request.get_json()
+     model = json_data['model']
+     chat_api = f"https://api-inference.huggingface.co/models/{model}/v1/chat/completions"
+
+     # Try to use the largest ctx
+     if not 'max_tokens' in json_data:
+         json_data['max_tokens'] = 2**32-1
+         json_data['json_mode'] = True
+         info = requests.post(chat_api, json=request.json, headers=headers, stream=False).text
+         json_data['json_mode'] = False
+         try:
+             max_ctx = int(info.split("<= ")[1].split(".")[0])
+             inputs = int(info.split("Given: ")[1].split("`")[0])
+             json_data['max_tokens'] = max_ctx - inputs - 1
+         except Exception as e:
+             print(e)
+
+     if not 'seed' in json_data:
+         json_data['seed'] = random.randint(1,2**32)
+
+     def generate():
+         with requests.post(chat_api, json=request.json, headers=headers, stream=True) as resp:
+             for chunk in resp.iter_content(chunk_size=1024):
+                 if chunk:
+                     yield chunk
+
+     return Response(stream_with_context(generate()), content_type='text/event-stream')
+
+ #import gevent.pywsgi
+ #from gevent import monkey;monkey.patch_all()
+ if __name__ == "__main__":
+     app.run(debug=True)
+     # gevent.pywsgi.WSGIServer((args.host, args.port), app).serve_forever()
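
For reference, the probe that this commit wraps in a try/except works by sending the upstream request with an oversized max_tokens and then parsing the model's context window and the prompt length out of the validation error text. A minimal standalone sketch of that parsing follows; the error string here is illustrative (the exact wording returned by the Inference API is an assumption), not captured output:

# Minimal sketch of the context-size probe used in proxy().
# The error text is an assumed example of the upstream validation message.
info = ("Input validation error: `inputs` tokens + `max_new_tokens` must be "
        "<= 32768. Given: 500 `inputs` tokens and 4294967295 `max_new_tokens`")

max_ctx = int(info.split("<= ")[1].split(".")[0])     # 32768, the model's context window
inputs = int(info.split("Given: ")[1].split("`")[0])  # 500, tokens already used by the prompt
print(max_ctx - inputs - 1)                           # 32267, what the adapter sets as max_tokens

If the message does not have this shape, the new try/except simply logs the parse error and leaves max_tokens at 2**32-1 instead of crashing the request, which is the behavioral change this commit makes.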
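
To round out the usage notes in the index() template, here is a hedged client-side sketch using the openai Python SDK. The tokens and model name are placeholders, and the /v1 suffix on the base URL is an assumption needed because this SDK appends only /chat/completions itself; clients that append /v1/chat/completions on their own should use the plain /api path as written above.

from openai import OpenAI

# Placeholder tokens and model; joining several tokens with ";" lets the
# adapter's proxy() pick one at random for each request.
client = OpenAI(
    base_url="https://tastypear-sia-chat-adapter.hf.space/api/v1",  # /v1 suffix assumed, see note above
    api_key="hf_aaaa;hf_bbbb",
)

stream = client.chat.completions.create(
    model="mistralai/Mistral-Nemo-Instruct-2407",
    messages=[{"role": "user", "content": "Hello!"}],
    stream=True,
)
for chunk in stream:
    if chunk.choices and chunk.choices[0].delta.content:
        print(chunk.choices[0].delta.content, end="")

The semicolon-joined api_key is sent as the Bearer token, which is exactly what the key-rotation logic in proxy() splits on before forwarding the request.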