DMIR
Dense medical information retrieval using pre-trained language models, including bidirectional encoders and generative large language models. Electronic medical records usually contain very dense and rich medical information. We aim to use the latest deep learning algorithms and technologies to retrieve and reuse the key medical information they contain.
MODEL
DeBERTa-V2-710M-Chinese, fine-tuned on custom datasets
RESULT
- accuracy 99.92%
- precision 88.83%
- recall 89.98%
- f1 89.40%
BEST PRACTICES
from transformers import (
AutoModelForTokenClassification,
AutoTokenizer,
)
import jieba
def align_space(words, words_remove_space, align_preds):
    """Expand per-word labels so they line up with ``words``, spaces included.

    ``align_preds`` holds one label per entry of ``words_remove_space``.
    This walks ``words`` in order and emits one label per word: a fixed
    label ``1`` for every ``' '`` word, and the next unused label from
    ``align_preds`` for every word that matches the next unused entry of
    ``words_remove_space``.

    Args:
        words: Word sequence that may contain ``' '`` entries.
        words_remove_space: The same sequence with the ``' '`` entries
            removed, in the same order.
        align_preds: One label per entry of ``words_remove_space``.

    Returns:
        A list of labels. Words that are neither ``' '`` nor equal to the
        next expected non-space word contribute no label.
    """
    origin_labels = []
    no_space_idx = 0
    total_non_space = len(words_remove_space)
    for word in words:
        if word == ' ':
            # Spaces have no entry in align_preds; give them label 1.
            origin_labels.append(1)
        elif (no_space_idx < total_non_space
                and word == words_remove_space[no_space_idx]):
            # The bounds check keeps trailing words (after every non-space
            # word has been consumed) from indexing past the end.
            origin_labels.append(align_preds[no_space_idx])
            no_space_idx += 1
    return origin_labels
def segment(align_preds, words):
    """Join ``words`` into one string, inserting ``"[SEP]"`` at boundaries.

    A word whose label is ``0`` or ``2`` is preceded by ``"[SEP]"``; words
    with any other label are appended unchanged.

    Args:
        align_preds: One label per word, in the same order as ``words``.
        words: The words to join.

    Returns:
        The concatenated string. Labels and words are paired positionally;
        surplus items in the longer sequence are ignored.
    """
    pieces = []
    for pred, word in zip(align_preds, words):
        if pred in (0, 2):
            # Labels 0 and 2 mark a word that opens a new segment.
            pieces.append("[SEP]")
        pieces.append(word)
    return "".join(pieces)
def align_words(preds, words_ids):
    """Reduce per-token predictions to one prediction per word.

    Entries of ``words_ids`` that are ``None`` are dropped first. For each
    run of consecutive equal word ids in what remains, only the prediction
    at the first position of the run is kept.

    Args:
        preds: Per-token predictions. It is indexed by position within the
            ``None``-filtered id list, so it must already exclude the
            positions whose word id is ``None``.
        words_ids: Per-token word ids; ``None`` marks tokens that belong to
            no word.

    Returns:
        A list with one prediction per run of equal word ids.
    """
    pred_res = []
    word_ids = [w for w in words_ids if w is not None]
    previous_word_idx = None
    for idx, word_id in enumerate(word_ids):
        if word_id != previous_word_idx:
            # First token of a new word: its prediction represents the word.
            pred_res.append(preds[idx])
            previous_word_idx = word_id
    return pred_res
# Usage example: split one EMR note into segments with the fine-tuned model.
# Replace 'path' with the location of the checkpoint.
# NOTE(review): trust_remote_code=True lets the checkpoint repository run its
# own Python code on load; only use it with a source you trust.
tokenizer = AutoTokenizer.from_pretrained('path', trust_remote_code=True)
model = AutoModelForTokenClassification.from_pretrained('path', trust_remote_code=True).cuda()
model.eval()
emr_data = "患者2年前无明显诱因出现左眼视力下降,无眼胀、眼痛,不伴畏光、流泪、异物感,无头疼、恶心、呕吐、头晕、头痛等症状,2014年于我院行左眼球后TENON囊下曲安奈德注药术,并行左眼视网膜激光凝固术。已多次于我科行左眼玻璃体腔药物注射术。治疗后视力有提高。近一周患者再次感视力下降,于2017年12月14日复查OCT提示左眼黄斑水肿明显。建议再次手术治疗。现为求进一步手术治疗,于2018年03月12日收入我院。患者自发病以来,神志清,精神可,饮食可,睡眠可,二便如常,体重无明显变化。无流感样症状。"
# Pre-split the text into words with jieba, then tokenize with
# is_split_into_words=True so inputs.word_ids() can map tokens back to words.
words = list(jieba.cut(emr_data, cut_all=False))
words_remove_space = [i for i in words if i != ' ']
inputs = tokenizer(words, return_tensors="pt", is_split_into_words=True, ).to("cuda")
logits = model(**inputs).logits.squeeze(0)
preds = logits.argmax(-1)
# Drop predictions for tokens that belong to no word (word id None).
preds = [p.item() for p, w in zip(preds, inputs.word_ids()) if w is not None]
# One label per word (first token of each word).
align_preds = align_words(preds, inputs.word_ids())
# Re-insert a label for every ' ' word so labels and `words` have equal length.
# NOTE(review): this assumes the tokenizer emits no tokens for ' ' words, so
# align_preds has one entry per non-space word — confirm for this tokenizer.
preds_with_space = align_space(words, words_remove_space, align_preds)
# Use the space-aligned labels: they are indexed like `words`, whereas
# align_preds skips spaces and would drift off its words after the first space.
segmentation_data = segment(preds_with_space, words)
print(segmentation_data)
# Output reported for this example (printed as one line; wrapped here at each [SEP]):
#[SEP]患者2年前无明显诱因出现左眼视力下降,无眼胀、眼痛,不伴畏光、流泪、异物感,无头疼、恶心、呕吐、头晕、头痛等症状,2014年于我院行左眼球后TENON囊下曲安奈德注药术,并行左眼视网膜激光凝固术。已多次于我科行左眼玻璃体腔药物注射术。治疗后视力有提高。
#[SEP]近一周患者再次感视力下降,于2017年12月14日复查OCT提示左眼黄斑水肿明显。建议再次手术治疗。现为求进一步手术治疗,于2018年03月12日收入我院。
#[SEP]患者自发病以来,神志清,精神可,饮食可,睡眠可,二便如常,体重无明显变化。无流感样症状。
- Downloads last month
- 15
This model does not have enough activity to be deployed to Inference API (serverless) yet. Increase its social
visibility and check back later, or deploy to Inference Endpoints (dedicated)
instead.