Spaces:

tastypear
/

sia-chat-adapter

Running

App Files Files Community

tastypear committed on Aug 10, 2024

Commit

4fae6ee

verified ·

1 Parent(s): fdbe505

Update main.py

Browse files

Files changed (1) hide show

main.py +79 -76

main.py CHANGED Viewed

@@ -1,76 +1,79 @@
-import random
-import requests
-from flask import Flask, request, Response, stream_with_context, render_template_string
-app = Flask(__name__)
-@app.route('/', methods=['GET'])
-def index():
-    template = '''
-    <html>
-        <head>
-            <title>Huggingface Chat API Adapter</title>
-        </head>
-        <body>
-            <h1>Huggingface Chat API Adapter</h1>
-[Introduction]<br>
-When using Huggingface's Serverless Inference API for a conversation, by default 100 new tokens are output and a cache is used.<br>
-This API changes these two default settings, and other parameters are consistent with the official API.<br>
-<br>
-[How to use]<br>
-1. <a target="_blank" href="https://huggingface.co/settings/tokens/new">Create a token</a> with the "Make calls to the serverless Inference API" permission as an API key.<br>
-2. Set the Base URL of the OpenAI compatible client to "https://tastypear-sia-chat-adapter.hf.space/api".<br>
-3. Use the full name of the model (e.g. mistralai/Mistral-Nemo-Instruct-2407)<br>
-<br>
-[Supported models]<br>
-Most of the available models can be found <a target="_blank" href="https://huggingface.co/models?inference=warm&other=text-generation-inference">HERE</a>.<br>
-Some "cold" models may also be supported (e.g. meta-llama/Meta-Llama-3.1-405B-Instruct), please test it yourself.<br>
-Some models require a token created by a PRO user to use.<br>
-<br>
-[Avoid reaching the call limit]<br>
-If you have multiple tokens, you can connect them with a semicolon (";") and the API will use a random one (e.g. "hf_aaaa;hf_bbbb;hf_...")<br>
-        </body>
-    </html>
-    '''
-    return render_template_string(template)
-@app.route('/api/v1/chat/completions', methods=['POST'])
-def proxy():
-    headers = dict(request.headers)
-    headers.pop('Host', None)
-    headers.pop('Content-Length', None)
-    keys = request.headers['Authorization'].split(' ')[1].split(';')
-    headers['Authorization'] = f'Bearer {random.choice(keys)}'
-    headers['X-Use-Cache'] = 'false'
-    json_data = request.get_json()
-    model = json_data['model']
-    chat_api = f"https://api-inference.huggingface.co/models/{model}/v1/chat/completions"
-    # Try to use the largest ctx
-    if not 'max_tokens' in json_data:
-        json_data['max_tokens'] = 2**32-1
-        json_data['json_mode'] = True
-        info = requests.post(chat_api, json=request.json, headers=headers, stream=False).text
-        max_ctx = int(info.split("<= ")[1].split(".")[0])
-        inputs = int(info.split("Given: ")[1].split("`")[0])
-        json_data['json_mode'] = False
-        json_data['max_tokens'] = max_ctx - inputs - 1
-    if not 'seed' in json_data:
-        json_data['seed'] = random.randint(1,2**32)
-    def generate():
-        with requests.post(chat_api, json=request.json, headers=headers, stream=True) as resp:
-            for chunk in resp.iter_content(chunk_size=1024):
-                if chunk:
-                    yield chunk
-    return Response(stream_with_context(generate()), content_type='text/event-stream')
-#import gevent.pywsgi
-#from gevent import monkey;monkey.patch_all()
-if __name__ == "__main__":
-    app.run(debug=True)
-    # gevent.pywsgi.WSGIServer((args.host, args.port), app).serve_forever()

+import random
+import requests
+from flask import Flask, request, Response, stream_with_context, render_template_string
+app = Flask(__name__)
+@app.route('/', methods=['GET'])
+def index():
+    template = '''
+    <html>
+        <head>
+            <title>Huggingface Chat API Adapter</title>
+        </head>
+        <body>
+            <h1>Huggingface Chat API Adapter</h1>
+[Introduction]<br>
+When using Huggingface's Serverless Inference API for a conversation, by default 100 new tokens are output and a cache is used.<br>
+This API changes these two default settings, and other parameters are consistent with the official API.<br>
+<br>
+[How to use]<br>
+1. <a target="_blank" href="https://huggingface.co/settings/tokens/new">Create a token</a> with the "Make calls to the serverless Inference API" permission as an API key.<br>
+2. Set the Base URL of the OpenAI compatible client to "https://tastypear-sia-chat-adapter.hf.space/api".<br>
+3. Use the full name of the model (e.g. mistralai/Mistral-Nemo-Instruct-2407)<br>
+<br>
+[Supported models]<br>
+Most of the available models can be found <a target="_blank" href="https://huggingface.co/models?inference=warm&other=text-generation-inference">HERE</a>.<br>
+Some "cold" models may also be supported (e.g. meta-llama/Meta-Llama-3.1-405B-Instruct), please test it yourself.<br>
+Some models require a token created by a PRO user to use.<br>
+<br>
+[Avoid reaching the call limit]<br>
+If you have multiple tokens, you can connect them with a semicolon (";") and the API will use a random one (e.g. "hf_aaaa;hf_bbbb;hf_...")<br>
+        </body>
+    </html>
+    '''
+    return render_template_string(template)
+@app.route('/api/v1/chat/completions', methods=['POST'])
+def proxy():
+    headers = dict(request.headers)
+    headers.pop('Host', None)
+    headers.pop('Content-Length', None)
+    keys = request.headers['Authorization'].split(' ')[1].split(';')
+    headers['Authorization'] = f'Bearer {random.choice(keys)}'
+    headers['X-Use-Cache'] = 'false'
+    json_data = request.get_json()
+    model = json_data['model']
+    chat_api = f"https://api-inference.huggingface.co/models/{model}/v1/chat/completions"
+    # Try to use the largest ctx
+    if not 'max_tokens' in json_data:
+        json_data['max_tokens'] = 2**32-1
+        json_data['json_mode'] = True
+        info = requests.post(chat_api, json=request.json, headers=headers, stream=False).text
+        json_data['json_mode'] = False
+        try:
+            max_ctx = int(info.split("<= ")[1].split(".")[0])
+            inputs = int(info.split("Given: ")[1].split("`")[0])
+            json_data['max_tokens'] = max_ctx - inputs - 1
+        except Exception as e:
+            print(e)
+    if not 'seed' in json_data:
+        json_data['seed'] = random.randint(1,2**32)
+    def generate():
+        with requests.post(chat_api, json=request.json, headers=headers, stream=True) as resp:
+            for chunk in resp.iter_content(chunk_size=1024):
+                if chunk:
+                    yield chunk
+    return Response(stream_with_context(generate()), content_type='text/event-stream')
+#import gevent.pywsgi
+#from gevent import monkey;monkey.patch_all()
+if __name__ == "__main__":
+    app.run(debug=True)
+    # gevent.pywsgi.WSGIServer((args.host, args.port), app).serve_forever()