In [1]:
from vllm import LLM, SamplingParams
import pandas as pd
import numpy as np
import torch.nn.functional as F
import torch
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM
import re
import os
from transformers import pipeline, AutoModel
from torch.nn import functional as F
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.nn import LSTM, Linear, Embedding, Conv1d, MaxPool1d, GRU, LSTMCell, Dropout, Module, Sequential, ReLU


  from .autonotebook import tqdm as notebook_tqdm
2025-01-16 18:44:13,978	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


In [2]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# The tagger model and its input batches are moved to this device in later cells.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [3]:
# here, pull in your raw patient clinical notes, imaging reports, and pathology reports
# your input file should contain at minimum columns like ['mrn', 'date', and 'text']; one row per clinical document
# you can combine notes from multiple patients into one input file as long as there is an mrn field
# this notebook expects MRNs to be called 'dfci_mrn', dates to be called 'date', and clinical text to be called 'text', so rename your columns accordingly; a 'note_type' column is also read when the patient histories are built
#all_reports = pd.read_csv("your_patient_notes_file_here.csv")


In [4]:
# this is how i pull reports for patients at dfci, commented out for public use


# prefix = '/data/clin_notes_outcomes/pan_dfci_2024/derived_data/'

# # pull in our large corpus of historical electronic health records data
# imaging = pd.read_parquet(prefix + 'all_imaging_reports.parquet')
# medonc = pd.read_parquet(prefix + 'all_clinical_notes.parquet')
# path = pd.read_parquet(prefix + 'all_path_reports.parquet')


# all_reports = pd.concat([imaging, medonc, path], axis=0).sort_values(by=['dfci_mrn','date']).reset_index(drop=True)


In [5]:
# NOTE: `all_reports` must already exist at this point; the read_csv line that would define it
# is commented out in an earlier cell, so load your own data before running this.
# Order the documents by patient and then chronologically.
all_reports = all_reports.sort_values(by=['dfci_mrn','date']).reset_index(drop=True)


In [6]:
# these are the fields in the raw DFCI data, yours will differ
# Sample from the distinct MRNs rather than from the document rows: sampling rows can pick the
# same patient more than once (yielding fewer than ten patients) and favours patients with many
# documents. The fixed seed makes the selection reproducible across reruns.
unique_mrns = all_reports.dfci_mrn.drop_duplicates()
ten_sample_patients = unique_mrns.sample(n=min(10, len(unique_mrns)), random_state=42)
all_reports = all_reports[all_reports.dfci_mrn.isin(ten_sample_patients)]
all_reports.info()

<class 'pandas.core.frame.DataFrame'>
Index: 622 entries, 1627657 to 13607361
Data columns (total 9 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   dfci_mrn       622 non-null    int64         
 1   date           622 non-null    datetime64[ns]
 2   text           622 non-null    object        
 3   scan_type      283 non-null    object        
 4   split          622 non-null    object        
 5   note_type      622 non-null    object        
 6   department     268 non-null    object        
 7   provider_type  268 non-null    object        
 8   path_type      71 non-null     object        
dtypes: datetime64[ns](1), int64(1), object(7)
memory usage: 48.6+ KB


In [7]:
# the next set of cells works to extract useful information from each clinical note in your dataset, yielding one long history document for each patient

In [8]:
# Names of the tagger outputs; the tagger model is built with one prediction head per entry.
valid_tags_list = ['is_tagged','cancer_type','stage_at_diagnosis','treatment','cancer_burden','cancer_status','adverse_event','comorbidity','biomarker']
# Logit cut-offs, presumably one per tag in the same order as valid_tags_list (the name suggests
# they were tuned for best F1 - not verifiable from this notebook).
# Only the first entry is applied when patient histories are extracted.
best_f1_thresholds = [-1.2996799,
 1.8744006,
 -0.90340906,
 -1.3298296,
 -1.3740511,
 -0.97108084,
 -1.0886533,
 -1.9212211,
 -0.7184834]



   
class TagModel(nn.Module):
    """Sentence tagger: a shared bert-tiny encoder feeding one small head per tag.

    Args:
        num_tags: number of prediction heads to create.
        device: torch device the encoder and the heads are moved to.
    """

    def __init__(self, num_tags, device):
        super().__init__()

        # Shared encoder. The attribute names `bert` and `prediction_heads` determine the
        # state-dict keys, so they must stay as they are for saved checkpoints to load.
        self.bert = AutoModel.from_pretrained('prajjwal1/bert-tiny').to(device)

        # One two-layer head (128 -> 128 -> 1) per tag, created in tag order.
        heads = []
        for _ in range(num_tags):
            head = Sequential(Linear(128, 128), ReLU(), Linear(128, 1))
            heads.append(head.to(device))
        self.prediction_heads = nn.ModuleList(heads)

    def forward(self, x_text_tensor, x_attention_mask):
        """Return a list with one output tensor per prediction head."""
        encoded = self.bert(x_text_tensor, x_attention_mask)
        # Hidden state of the first token of every sequence is the input to all heads.
        first_token_state = encoded.last_hidden_state[:, 0, :].squeeze(1)
        return [head(first_token_state) for head in self.prediction_heads]

num_valid_tags = len(valid_tags_list)
themodel = TagModel(num_valid_tags, device)
# map_location makes the checkpoint loadable on machines without the GPU it was saved from
# (including CPU-only ones). weights_only=True restricts unpickling to tensors and plain
# containers, which is all a state dict needs and avoids executing arbitrary pickled code.
tagger_state_dict = torch.load('./tiny_bert_tagger_synthetic.pt', map_location=device, weights_only=True)
themodel.load_state_dict(tagger_state_dict)
themodel.to(device)

# Inference mode (disables dropout); the model repr is shown as the cell output.
themodel.eval()

  themodel.load_state_dict(torch.load('./tiny_bert_tagger_synthetic.pt'))


TagModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-1): 2 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=128, out_features=128, bias=True)
              (key): Linear(in_features=128, out_features=128, bias=True)
              (value): Linear(in_features=128, out_features=128, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=128, out_features=128, bias=True)
              (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine

In [9]:
from torch.utils import data
from transformers import AutoTokenizer

class UnlabeledTagDataset(data.Dataset):
    """Dataset that tokenizes the 'excerpt' column of a frame for tagger inference.

    No labels are produced; each item is an (input_ids, attention_mask) pair of
    long tensors padded/truncated to 128 tokens.
    """

    def __init__(self, pandas_dataset, valid_tags_list):
        # Work on a private copy with a fresh positional index.
        frame = pandas_dataset.copy()
        self.data = frame.reset_index(drop=True)
        self.indices = self.data.index.unique()
        self.valid_tags_list = valid_tags_list
        self.tokenizer = AutoTokenizer.from_pretrained(
            'prajjwal1/bert-tiny', max_length=128, truncation_side='left'
        )

    def __len__(self):
        """Number of excerpts in the dataset."""
        return len(self.indices)

    def __getitem__(self, index):
        """Tokenize the excerpt at position `index` and return (input_ids, attention_mask)."""
        row = self.data.loc[self.indices[index], :]
        tokens = self.tokenizer(row['excerpt'], padding='max_length', max_length=128, truncation=True)

        input_ids = torch.tensor(tokens.input_ids, dtype=torch.long)
        attention_mask = torch.tensor(tokens.attention_mask, dtype=torch.long)
        return input_ids, attention_mask
        
def extract_relevant_text_from_patient(patient_frame_original, valid_tags_list, best_f1_thresholds, tagger_model):
    """Build one chronological history string from a patient's documents.

    Every document is split into sentence-like excerpts, the excerpts are scored by the
    tagger, and only those whose first-head logit exceeds ``best_f1_thresholds[0]`` are kept.
    Kept excerpts are re-joined per (date, note_type) and emitted one line per group.

    Args:
        patient_frame_original: frame with 'date', 'note_type' and 'text' columns; not modified.
        valid_tags_list: tag names; its length must match the number of tagger outputs.
        best_f1_thresholds: logit cut-offs; only the first entry is applied here.
        tagger_model: module called as ``tagger_model(input_ids, attention_mask)`` and
            returning one (batch, 1) tensor per tag.

    Returns:
        str: one line per (date, note_type) group of the form "<date> <note_type> <excerpts>",
        or "" when the patient has no usable text.
    """
    num_valid_tags = len(valid_tags_list)
    patient_frame = patient_frame_original.copy()
    patient_frame['date'] = pd.to_datetime(patient_frame.date)
    patient_frame = patient_frame.sort_values(by='date').reset_index()

    chunk_frames = []
    for i in range(patient_frame.shape[0]):
        note = patient_frame.iloc[i]
        note_text = note['text']
        # Documents without text (e.g. NaN) are skipped instead of crashing on .strip().
        if not isinstance(note_text, str):
            continue
        chunks = re.sub("\n|\r", " ", note_text.strip())
        chunks = re.sub(r'\s+', " ", chunks)
        # Split on ". " via an explicit marker; the period is dropped here and restored by the join below.
        chunks = "<excerpt break>" + re.sub("\\. ", "<excerpt break>", chunks) + "<excerpt break>"
        chunks = pd.Series(chunks.split("<excerpt break>")).str.strip()
        chunks = chunks[chunks != '']
        if chunks.empty:
            continue

        chunk_frames.append(pd.DataFrame({'date': note['date'], 'note_type': note['note_type'], 'excerpt': chunks}))

    # Nothing to score: no documents, or none with any text. Returning early also avoids
    # np.concatenate failing on an empty list of batches further down.
    if len(chunk_frames) == 0:
        return ""

    chunk_frames = pd.concat(chunk_frames, axis=0)
    # Identical excerpts (copied-forward note text) are kept only at their first occurrence.
    chunk_frames = chunk_frames.drop_duplicates(subset=['excerpt'], keep='first')

    # Run on whatever device the tagger lives on rather than relying on a notebook global.
    model_device = next(tagger_model.parameters()).device
    excerpt_loader = data.DataLoader(UnlabeledTagDataset(chunk_frames, valid_tags_list), batch_size=32, shuffle=False, num_workers=0)

    output_prediction_lists = [[] for _ in range(num_valid_tags)]
    with torch.no_grad():
        for x_text_ids, x_attention_mask in excerpt_loader:
            predictions = tagger_model(x_text_ids.to(model_device), x_attention_mask.to(model_device))
            for j in range(num_valid_tags):
                output_prediction_lists[j].append(predictions[j].squeeze(1).detach().cpu().numpy())

    output_prediction_lists = [np.concatenate(x) for x in output_prediction_lists]

    # The loader is unshuffled, so predictions line up positionally with the excerpt rows.
    output = chunk_frames.copy()
    for x in range(num_valid_tags):
        output['outcome_' + str(x) + '_logit'] = output_prediction_lists[x]

    output = output[output.outcome_0_logit > best_f1_thresholds[0]]

    output = output.groupby(['date','note_type'])['excerpt'].agg('. '.join).reset_index()
    output = output[~output.excerpt.isnull()]
    output['date_text'] = output['date'].astype(str) + " " + output['note_type'] + " " + output['excerpt']
    return "\n".join(output.date_text.tolist())
       

In [10]:
%%capture
# this generates a data frame with one row per patient, and a patient_long_text column with a bunch of relevant text extracted from each patient's notes

patient_list = []
unique_patients = all_reports.groupby('dfci_mrn').first().reset_index()[['dfci_mrn']]
# Group once up front instead of re-filtering the whole report table for every patient.
reports_by_patient = all_reports.groupby('dfci_mrn')
for i in range(unique_patients.shape[0]):
    # Explicit copy: the new column below is then added to an independent one-row frame
    # rather than to a slice of unique_patients (which is what triggers pandas'
    # chained-assignment warning).
    unique_patient = unique_patients.iloc[[i]].copy()
    patient_frame = reports_by_patient.get_group(unique_patient.dfci_mrn.iloc[0])
    if patient_frame.shape[0] > 0:
        # this next line is used for retrospective analysis to restrict input text to text predating a treatment start
        #patient_frame = patient_frame[pd.to_datetime(patient_frame.date) < patient_frame.treatment_start_date.iloc[0]]
        unique_patient['patient_long_text'] = extract_relevant_text_from_patient(patient_frame, valid_tags_list, best_f1_thresholds, themodel)
        patient_list.append(unique_patient)

In [11]:
# Stack the one-row per-patient frames into a single frame with one row per patient.
long_histories = pd.concat(patient_list, axis=0)
long_histories.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   dfci_mrn           10 non-null     int64 
 1   patient_long_text  10 non-null     object
dtypes: int64(1), object(1)
memory usage: 240.0+ bytes


In [12]:
# now you have long histories for each patient
# delete tiny bert tagging model to make room on GPU for llama
import gc

del themodel
# `del` only removes the Python reference. Collect garbage so the tensors are actually freed,
# then ask PyTorch's caching allocator to return its unused GPU blocks to the driver.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [13]:
# now get ready to use llama to summarize patient histories and extract trial spaces

In [14]:
# load llama
# modify this depending on your GPU setup and where you want to download the llm
# requires vllm
import os
from vllm import LLM
# NOTE(review): CUDA has already been used in this process by the tagger cells, so changing
# CUDA_VISIBLE_DEVICES at this point may not affect the current process - TODO confirm.
# Setting it before the first CUDA call would be safer.
os.environ["CUDA_VISIBLE_DEVICES"]="2,3"
# tensor_parallel_size should match the number of GPUs listed in CUDA_VISIBLE_DEVICES.
llama = LLM(model='hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4', tensor_parallel_size = 2, download_dir = "../meta_ai/", gpu_memory_utilization=0.80, max_model_len=120000)

INFO 01-16 18:49:14 awq_marlin.py:97] The model is convertible to awq_marlin during runtime. Using awq_marlin kernel.
INFO 01-16 18:49:14 config.py:905] Defaulting to use mp for distributed inference
INFO 01-16 18:49:14 config.py:1021] Chunked prefill is enabled with max_num_batched_tokens=512.
INFO 01-16 18:49:14 llm_engine.py:237] Initializing an LLM engine (v0.6.3.post1) with config: model='hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4', speculative_config=None, tokenizer='hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=120000, download_dir='../meta_ai/', load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq_marlin, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, dev

Loading safetensors checkpoint shards:   0% Completed | 0/9 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  11% Completed | 1/9 [00:17<02:23, 17.88s/it]
Loading safetensors checkpoint shards:  22% Completed | 2/9 [01:02<03:57, 33.89s/it]
Loading safetensors checkpoint shards:  33% Completed | 3/9 [01:49<03:58, 39.79s/it]
Loading safetensors checkpoint shards:  44% Completed | 4/9 [02:36<03:32, 42.58s/it]
Loading safetensors checkpoint shards:  56% Completed | 5/9 [03:23<02:56, 44.20s/it]
Loading safetensors checkpoint shards:  67% Completed | 6/9 [03:56<02:00, 40.29s/it]
Loading safetensors checkpoint shards:  78% Completed | 7/9 [04:43<01:25, 42.53s/it]
Loading safetensors checkpoint shards:  89% Completed | 8/9 [05:30<00:43, 43.99s/it]
Loading safetensors checkpoint shards: 100% Completed | 9/9 [06:16<00:00, 44.57s/it]
Loading safetensors checkpoint shards: 100% Completed | 9/9 [06:16<00:00, 41.83s/it]



INFO 01-16 18:55:45 model_runner.py:1067] Loading model weights took 18.5818 GB
[1;36m(VllmWorkerProcess pid=2453819)[0;0m INFO 01-16 18:55:47 model_runner.py:1067] Loading model weights took 18.5807 GB
INFO 01-16 18:55:48 distributed_gpu_executor.py:57] # GPU blocks: 17638, # CPU blocks: 1638
INFO 01-16 18:55:48 distributed_gpu_executor.py:61] Maximum concurrency for 120000 tokens per request: 2.35x
[1;36m(VllmWorkerProcess pid=2453819)[0;0m INFO 01-16 18:55:53 model_runner.py:1395] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
[1;36m(VllmWorkerProcess pid=2453819)[0;0m INFO 01-16 18:55:53 model_runner.py:1399] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease

In [15]:
# generate summaries for our patients

In [16]:
def summarize_patients(patient_texts, llama_model, max_input_tokens=115000):
    """Generate a cancer-history summary for each patient text with the LLM.

    Texts longer than ``max_input_tokens`` tokens are shortened to their first and last
    ``max_input_tokens // 2`` tokens, and the shortened text is what is placed in the prompt.
    (Previously the shortened text was computed but the full text was still sent.)
    Texts within the limit are passed through unchanged.

    Args:
        patient_texts: iterable of str, one chronological history excerpt per patient.
        llama_model: vLLM ``LLM`` instance (must provide ``get_tokenizer`` and ``generate``).
        max_input_tokens: token budget for the patient text alone; must be at least 2.

    Returns:
        tuple: (responses, response_texts) - the raw vLLM request outputs and, for each
        patient, the text of the first generated completion.

    Raises:
        ValueError: if ``max_input_tokens`` is smaller than 2.
    """
    if max_input_tokens < 2:
        raise ValueError("max_input_tokens must be at least 2")
    half_budget = max_input_tokens // 2

    tokenizer = llama_model.get_tokenizer()

    system_prompt = """You are an experienced clinical oncology history summarization bot.
        Your job is to construct a summary of the cancer history for a patient based on an excerpt of the patient's electronic health record. The text in the excerpt is provided in chronological order.     
        Document the cancer type/primary site (eg breast cancer, lung cancer, etc); histology (eg adenocarcinoma, squamous carcinoma, etc); current extent (localized, advanced, metastatic, etc); biomarkers (genomic results, protein expression, etc); and treatment history (surgery, radiation, chemotherapy/targeted therapy/immunotherapy, etc, including start and stop dates and best response if known).
        Do not consider localized basal cell or squamous carcinomas of the skin, or colon polyps, to be cancers for your purposes.
        Do not include the patient's name, but do include relevant dates whenever documented, including dates of diagnosis and start/stop dates of each treatment.
        If a patient has a history of more than one cancer, document the cancers one at a time.
        """
    closing_instruction = """Now, write your summary. Do not add preceding text before the abstraction, and do not add notes or commentary afterwards. This will not be used for clinical care, so do not write any disclaimers or cautionary notes."""

    prompts = []
    for the_patient in patient_texts:
        patient_text_tokens = tokenizer(the_patient, add_special_tokens=False).input_ids
        if len(patient_text_tokens) > max_input_tokens:
            # Keep the earliest and the most recent parts of the record, drop the middle.
            kept_tokens = patient_text_tokens[:half_budget] + patient_text_tokens[-half_budget:]
            patient_text = tokenizer.decode(kept_tokens)
        else:
            # Within budget: use the text verbatim (no tokenize/decode round trip).
            patient_text = the_patient

        # NOTE(review): there is no separator between the excerpt and the closing instruction;
        # the prompt wording is left exactly as it was.
        messages = [
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': "The excerpt is:\n" + patient_text + closing_instruction},
        ]
        prompts.append(tokenizer.apply_chat_template(conversation=messages, add_generation_prompt=True, tokenize=False))

    responses = llama_model.generate(
        prompts,
        SamplingParams(
            temperature=0.0,
            top_p=0.2,
            max_tokens=4096,
            repetition_penalty=1.2,
            # Stop on either end-of-sequence marker used by the Llama 3 chat format.
            stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")],
        ))

    response_texts = [x.outputs[0].text for x in responses]

    return responses, response_texts
    

In [17]:
# summarize_patients returns (responses, response_texts); index [1] keeps the list of summary strings.
long_histories['patient_summary'] = summarize_patients(long_histories.patient_long_text.tolist(), llama)[1]

Processed prompts: 100%|█████████| 10/10 [00:47<00:00,  4.71s/it, est. speed input: 925.94 toks/s, output: 42.00 toks/s]


In [18]:
# now we turn attention to the clinical trials we want to match against
# assume you have a dataset of trials, each with an eligibility_criteria text field as from clinicaltrials.gov
# here, i just used a download from ct.gov for trials relating to cancer
trials = pd.read_csv('ctgov_cancer_trials.csv')

In [19]:
# ultimately you want to have a raw trial_text field that combines the trial title, summary, and eligibility criteria text from ct.gov
# Missing fields are treated as empty strings; otherwise a single NaN in any of the three
# columns would turn the whole trial_text into NaN and break prompt construction later.
trial_text_parts = trials[['title', 'brief_summary', 'eligibility_criteria']].fillna('')
trials['trial_text'] = trial_text_parts['title'] + "\n" + trial_text_parts['brief_summary'] + "\n" + trial_text_parts['eligibility_criteria']

In [20]:
# now summarize the trials of interest to you based on the trial_text field
def summarize_trials_multi_cohort(eligibility_texts, llama_model):
    """Extract the list of eligible 'clinical spaces' from each trial document with the LLM.

    Args:
        eligibility_texts: iterable of str, one trial document (title, summary and
            eligibility criteria) per trial.
        llama_model: vLLM ``LLM`` instance (must provide ``get_tokenizer`` and ``generate``).

    Returns:
        tuple: (responses, response_texts) - the raw vLLM request outputs and, for each
        trial, the text of the first generated completion (a numbered list of spaces).
    """
    # Use the tokenizer of the model that was passed in, not of a notebook-level global.
    tokenizer = llama_model.get_tokenizer()

    system_prompt = """You are an expert clinical oncologist with an encyclopedic knowledge of cancer and its treatments.
        Your job is to review a clinical trial document and extract a list of structured clinical spaces that are eligible for that trial.
        A clinical space is defined as a unique combination of cancer primary site, histology, which treatments a patient must have received, which treatments a patient must not have received, cancer burden (eg presence of metastatic disease), and tumor biomarkers (such as germline or somatic gene mutations or alterations, or protein expression on tumor) that a patient must have or must not have; that renders a patient eligible for the trial.
        Trials often specify that a particular treatment is excluded only if it was given within a short period of time, for example 14 days, one month, etc , prior to trial start. Do not include this type of time-specific treatment eligibility criteria in your output at all.
        Some trials have only one space, while others have several. Do not output a space that contains multiple cancer types and/or histologies. Instead, generate separate spaces for each cancer type/histology combination.
        For biomarkers, if the trial specifies whether the biomarker will be assessed during screening, note that.
        Spell out cancer types; do not abbreviate them. For example, write "non-small cell lung cancer" rather than "NSCLC".
        Structure your output like this, as a list of spaces, with spaces separated by newlines, as below:
        1. Cancer type allowed: <cancer_type_allowed>. Histology allowed: <histology_allowed>. Cancer burden allowed: <cancer_burden_allowed>. Prior treatment required: <prior_treatments_requred>. Prior treatment excluded: <prior_treatments_excluded>. Biomarkers required: <biomarkers_required>. Biomarkers excluded: <biomarkers_excluded>.
        2. Cancer type allowed: <cancer_type_allowed>, etc.
        If a particular concept is not mentioned in the trial text, do not include it in your definition of trial space(s).
        """
    closing_instruction = """Now, generate your list of the trial space(s), formatted as above.
            Do not provide any introductory, explanatory, concluding, or disclaimer text.
            Reminder: Treatment history is an important component of trial space definitions, but treatment history requirements that are described as applying only in a given period of time prior to trial treatment MUST BE IGNORED."""

    prompts = []
    for trial in eligibility_texts:
        messages = [
            {'role': 'system', 'content': system_prompt},
            {'role': 'user', 'content': "Here is a clinical trial document: \n" + trial + "\n" + closing_instruction},
        ]
        prompts.append(tokenizer.apply_chat_template(conversation=messages, add_generation_prompt=True, tokenize=False))

    responses = llama_model.generate(
        prompts,
        SamplingParams(
            temperature=0.0,
            top_p=0.9,
            max_tokens=3096,
            # Stop on either end-of-sequence marker used by the Llama 3 chat format.
            stop_token_ids=[tokenizer.eos_token_id, tokenizer.convert_tokens_to_ids("<|eot_id|>")],
        ))

    response_texts = [x.outputs[0].text for x in responses]

    return responses, response_texts

In [21]:
# this runs the trial summarization/space extraction
# i have a premade trial spaces file, so this is commented out
# trials['spaces'] = summarize_trials_multi_cohort(trials.trial_text.tolist(), llama)[1]
#trials.to_csv('ctgov_all_trials_unique_trial_spaces_10-31-24.csv')

In [22]:
# Load the pre-generated trial table; the next step reads its 'spaces' column, which holds
# the extracted trial spaces for each trial as newline-separated text.
trials = pd.read_csv('ctgov_all_trials_unique_trial_spaces_10-31-24.csv')

In [23]:
# now parse the extracted trial spaces to get one row per space (can be one or more rows per trial)

In [24]:
frames = []
for i in range(trials.shape[0]):
    trial_row = trials.iloc[[i]]
    spaces_text = trial_row.spaces.iloc[0]
    # A trial with no extracted text (NaN after the CSV round trip) has nothing to expand;
    # skip it instead of failing on .split().
    if not isinstance(spaces_text, str):
        continue
    cohorts = pd.Series(spaces_text.split("\n"))
    cohorts = cohorts[~((cohorts.isnull()) | (cohorts == "\n") | (cohorts == ''))].reset_index(drop=True)
    if cohorts.empty:
        continue
    # Repeat the trial's values once per space. Repeating the plain ndarray keeps np.repeat on
    # well-defined ground instead of relying on how NumPy wraps a DataFrame argument.
    frame = pd.DataFrame(np.repeat(trial_row.to_numpy(), len(cohorts), axis=0), columns=trials.columns)
    frame['this_space'] = cohorts
    # Position of the space within its trial (0-based).
    frame['space_number'] = frame.index
    frames.append(frame)
    

In [25]:
# One row per extracted line; the index restarts at 0 for every trial because the
# per-trial frames are concatenated without resetting it.
cohort_level_trials = pd.concat(frames, axis=0)

In [26]:
# Show the columns, non-null counts and dtypes of the expanded table.
cohort_level_trials.info()

<class 'pandas.core.frame.DataFrame'>
Index: 38276 entries, 0 to 0
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Unnamed: 0.1          38276 non-null  object
 1   Unnamed: 0            38276 non-null  object
 2   nct_id                38276 non-null  object
 3   title                 38276 non-null  object
 4   brief_summary         38276 non-null  object
 5   eligibility_criteria  38276 non-null  object
 6   trial_text            38276 non-null  object
 7   spaces                38276 non-null  object
 8   this_space            38276 non-null  object
 9   space_number          38276 non-null  int64 
dtypes: int64(1), object(9)
memory usage: 3.2+ MB


In [27]:
# Count how many extracted lines start with a digit 1-9, i.e. look like a numbered trial space.
first_character = cohort_level_trials.this_space.str[0]
first_character.isin(list('123456789')).value_counts()

this_space
True     38140
False      136
Name: count, dtype: int64

In [28]:
# Keep only the lines that begin with a digit 1-9 (the numbered trial spaces).
is_numbered_space = cohort_level_trials.this_space.str[0].isin(list('123456789'))
trial_spaces = cohort_level_trials[is_numbered_space]

In [29]:
# if you want to save the extracted individual trial 'spaces' do this
#trial_spaces.to_csv('ctgov_all_trials_trial_space_lineitems_10-31-24.csv')

In [30]:
# this trial dataframe now has one row per trial 'space'; i have pre-generated it
# NOTE: this replaces the trial_spaces frame built in the previous cells with the saved copy.
trial_spaces = pd.read_csv('ctgov_all_trials_trial_space_lineitems_10-31-24.csv')

In [31]:
# now embed patients and trial spaces
from sentence_transformers import SentenceTransformer

# lazily using cpu here
# NOTE(review): trust_remote_code=True allows code shipped with the model repository to run
# locally - confirm the repository is trusted.
embedding_model = SentenceTransformer('ksg-dfci/TrialSpace', trust_remote_code=True, device='cpu')


Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.23it/s]


In [32]:
# Pass a plain list of strings (as the trial-space embedding cell does) rather than a pandas
# Series, so encoding does not depend on how the Series happens to be indexed.
with torch.no_grad():
    patient_embeddings = embedding_model.encode(long_histories.patient_summary.tolist(), convert_to_tensor=True)

In [33]:
# here's where we embed trial spaces
# this only needs to be run once to generate and save trial embeddings, or for a short list of trials you can run it every time
# here it is commented out, since I'll just load the previously generated embeddings

# with torch.no_grad():
#    trial_space_embeddings = embedding_model.encode(trial_spaces.this_space.tolist(), convert_to_tensor=True)

# from safetensors.torch import save_file
# output_trial_file = {"space_embeddings": trial_space_embeddings}
# save_file(output_trial_file, "trial_space_embeddings.safetensors")

# trial_space_embeddings.shape

In [34]:
# load trial space embeddings, should have same number of embeddings as there are in the trial spaces dataset
from safetensors import safe_open
# Read the tensor stored under the key "space_embeddings" onto the CPU.
with safe_open("trial_space_embeddings.safetensors", framework="pt", device='cpu') as f:
    trial_space_embeddings = f.get_tensor("space_embeddings")

In [35]:
# Embedding row i is matched to trial_spaces row i purely by position when the top matches are
# looked up, so a count mismatch would silently attach the wrong trials - fail loudly instead.
assert trial_space_embeddings.shape[0] == trial_spaces.shape[0], "trial_space_embeddings and trial_spaces have different numbers of rows"
trial_space_embeddings.shape, trial_spaces.shape

(torch.Size([38140, 1024]), (38140, 10))

In [36]:
# now let's find the top ten trial 'spaces' for each patient based on cosine similarity

output_list = []
for i, patient_summary in enumerate(long_histories.patient_summary):
    # Similarity of this patient's embedding to every trial-space embedding.
    similarities = F.cosine_similarity(patient_embeddings[i, :], trial_space_embeddings)
    ranking = torch.sort(similarities, descending=True)
    # Positions of the ten most similar spaces, best first.
    top_positions = ranking.indices[0:10].cpu().numpy()
    relevant_spaces = trial_spaces.iloc[top_positions]
    patient_matches = pd.DataFrame({
        'patient_summary': patient_summary,
        'this_space': relevant_spaces.this_space,
        'nct_id': relevant_spaces.nct_id,
        'trial_title': relevant_spaces.title,
        'trial_brief_summary': relevant_spaces.brief_summary,
        'trial_text': relevant_spaces.trial_text,
    })
    output_list.append(patient_matches)

# One row per (patient, candidate space), with a fresh 0..n-1 index.
output = pd.concat(output_list, axis=0).reset_index(drop=True)
output['patient_summary'] = output['patient_summary'].str.strip()

In [37]:
# now run 'trial checker' classifier to double check the top (10) matches we have pulled

In [38]:
# Tokenizer passed explicitly to the classification pipeline below.
tokenizer = AutoTokenizer.from_pretrained("roberta-large")

# Text-classification pipeline for the 'ksg-dfci/TrialChecker' model; inputs are truncated and
# padded to 512 tokens.
# NOTE(review): no `device` argument is passed, so (per the warning printed when this cell ran)
# the model runs on CPU.
checker_pipe = pipeline(
    'text-classification', 
    'ksg-dfci/TrialChecker', 
    tokenizer=tokenizer, 
    truncation=True, 
    padding='max_length', 
    max_length=512
)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [39]:

# Classifier input: the trial space first, then the patient summary.
pair_separator = "\nNow here is the patient summary:"
output['pt_trial_pair'] = output['this_space'] + pair_separator + output['patient_summary']

# One result dict (with 'label' and 'score') per patient/space pair, in row order.
classifier_results = checker_pipe(output['pt_trial_pair'].tolist())
checker_labels = []
checker_scores = []
for result in classifier_results:
    checker_labels.append(result['label'])
    checker_scores.append(result['score'])
output['trial_checker_result'] = checker_labels
output['trial_checker_score'] = checker_scores

In [40]:

# Show the columns, non-null counts and dtypes of the final match table.
output.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   patient_summary       100 non-null    object 
 1   this_space            100 non-null    object 
 2   nct_id                100 non-null    object 
 3   trial_title           100 non-null    object 
 4   trial_brief_summary   100 non-null    object 
 5   trial_text            100 non-null    object 
 6   pt_trial_pair         100 non-null    object 
 7   trial_checker_result  100 non-null    object 
 8   trial_checker_score   100 non-null    float64
dtypes: float64(1), object(8)
memory usage: 7.2+ KB


In [None]:
# Display the final table: one row per patient / candidate trial space with the checker's label and score.
output