File size: 6,666 Bytes
0e8466f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 |
from transformers import AutoTokenizer
from http.server import HTTPServer, BaseHTTPRequestHandler
import json
import argparse
class Tokenizer_Http():
def __init__(self):
path = 'internvl3_tokenizer'
self.tokenizer = AutoTokenizer.from_pretrained(path,
trust_remote_code=True,
use_fast=False)
def encode(self, content):
prompt = f"<|im_start|>system\n你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型,英文名叫InternVL, 是一个有用无害的人工智能助手。<|im_end|><|im_start|>user\n{content}<|im_end|><|im_start|>assistant\n"
input_ids = self.tokenizer.encode(prompt)
return input_ids
def encode_with_image(self, question, num_of_images, imgsz) -> list:
prompt = "<|im_start|>system\n你是书生·万象, 英文名是InternVL, 是由上海人工智能实验室、清华大学及多家合作单位联合开发的多模态大语言模型.<|im_end|>\n"
prompt += "<|im_start|>user\n" + question
context_len = 64
if imgsz == 448:
context_len = 256
elif imgsz == 224:
context_len = 64
else:
print(f"imgsz is {imgsz}")
return
print("context_len is ", context_len)
if num_of_images > 0:
for idx in range(num_of_images):
prompt += "\n<img>" + "<IMG_CONTEXT>" * context_len + "</img>\n"
prompt += "<|im_end|>\n<|im_start|>assistant"
print(f"prompt is {prompt}")
token_ids = self.tokenizer.encode(prompt)
return token_ids
def decode(self, token_ids):
return self.tokenizer.decode(token_ids,
clean_up_tokenization_spaces=False, skip_special_tokens=True)
@property
def bos_id(self):
return self.tokenizer.bos_token_id
@property
def eos_id(self):
return self.tokenizer.eos_token_id
@property
def bos_token(self):
return self.tokenizer.bos_token
@property
def eos_token(self):
return self.tokenizer.eos_token
@property
def img_start_token(self):
return self.tokenizer.encode("<img>")[0]
@property
def img_context_token(self):
return self.tokenizer.encode("<IMG_CONTEXT>")[0]
tokenizer = Tokenizer_Http()
print(tokenizer.bos_id, tokenizer.bos_token, tokenizer.eos_id,
tokenizer.eos_token, tokenizer.img_start_token, tokenizer.img_context_token)
token_ids = tokenizer.encode_with_image("你好", 1, 448)
print(token_ids)
print(len(token_ids))
token_ids = tokenizer.encode("hello world")
print(token_ids)
print(len(token_ids))
class Request(BaseHTTPRequestHandler):
#通过类继承,新定义类
timeout = 5
server_version = 'Apache'
def do_GET(self):
print(self.path)
#在新类中定义get的内容(当客户端向该服务端使用get请求时,本服务端将如下运行)
self.send_response(200)
self.send_header("type", "get") #设置响应头,可省略或设置多个
self.end_headers()
if self.path == '/bos_id':
bos_id = tokenizer.bos_id
# print(bos_id)
# to json
if bos_id is None:
msg = json.dumps({'bos_id': -1})
else:
msg = json.dumps({'bos_id': bos_id})
elif self.path == '/eos_id':
eos_id = tokenizer.eos_id
if eos_id is None:
msg = json.dumps({'eos_id': -1})
else:
msg = json.dumps({'eos_id': eos_id})
elif self.path == '/img_start_token':
img_start_token = tokenizer.img_start_token
if img_start_token is None:
msg = json.dumps({'img_start_token': -1})
else:
msg = json.dumps({'img_start_token': img_start_token})
elif self.path == '/img_context_token':
img_context_token = tokenizer.img_context_token
if img_context_token is None:
msg = json.dumps({'img_context_token': -1})
else:
msg = json.dumps({'img_context_token': img_context_token})
else:
msg = 'error'
print(msg)
msg = str(msg).encode() #转为str再转为byte格式
self.wfile.write(msg) #将byte格式的信息返回给客户端
def do_POST(self):
#在新类中定义post的内容(当客户端向该服务端使用post请求时,本服务端将如下运行)
data = self.rfile.read(int(
self.headers['content-length'])) #获取从客户端传入的参数(byte格式)
data = data.decode() #将byte格式转为str格式
self.send_response(200)
self.send_header("type", "post") #设置响应头,可省略或设置多个
self.end_headers()
if self.path == '/encode':
req = json.loads(data)
print(req)
prompt = req['text']
b_img_prompt = False
if 'img_prompt' in req:
b_img_prompt = req['img_prompt']
if b_img_prompt:
num_img = req['num_img']
imgsz = req['imgsz']
token_ids = tokenizer.encode_with_image(prompt, num_img, imgsz)
else:
token_ids = tokenizer.encode(prompt)
if token_ids is None:
msg = json.dumps({'token_ids': -1})
else:
msg = json.dumps({'token_ids': token_ids})
elif self.path == '/decode':
req = json.loads(data)
token_ids = req['token_ids']
text = tokenizer.decode(token_ids)
if text is None:
msg = json.dumps({'text': ""})
else:
msg = json.dumps({'text': text})
else:
msg = 'error'
print(msg)
msg = str(msg).encode() #转为str再转为byte格式
self.wfile.write(msg) #将byte格式的信息返回给客户端
if __name__ == "__main__":
args = argparse.ArgumentParser()
args.add_argument('--host', type=str, default='0.0.0.0')
args.add_argument('--port', type=int, default=12345)
args = args.parse_args()
host = (args.host, args.port) #设定地址与端口号,'localhost'等价于'127.0.0.1'
print('http://%s:%s' % host)
server = HTTPServer(host, Request) #根据地址端口号和新定义的类,创建服务器实例
server.serve_forever() #开启服务
|