This-and-That / scripts / process_llama.py
'''
Process the llama jsonl output and prepare the text files for the next step.
'''
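
# A rough sketch of the record format this script assumes; the values below are
# illustrative only, inferred from how the fields are used further down:
#
#   {"file_path": "/home/chfeng/llama3/full_text_tmp/<sub_path>lang.txt",
#    "input": "<the full-text prompt fed to the LLM>",
#    "output": "action: <...>\nobject: <...>\n..."}
#
# "output" is expected to start with "action:" and to hold one "<key>: <value>"
# entry per line; only the <value> parts are written to processed_text.txt.
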
import os
import json
import collections
if __name__ == "__main__":
    # Define the important paths
    json_path = "../SVD1/v1.jsonl"
    folder_path = "/home/kiteret/Desktop/StableVideoDiffusion/full_text_tmp/"

    # Read the jsonl file (one json record per line)
    with open(json_path, 'r') as json_file:
        json_list = list(json_file)
    # Iterate over all the json records
    length_stats = collections.defaultdict(int)
    for json_info in json_list:
        json_info = json.loads(json_info)

        # Define the path to write: strip the remote prefix and the trailing "lang.txt"
        key_start = len("/home/chfeng/llama3/full_text_tmp/")
        key_end = len("lang.txt")
        sub_path = json_info["file_path"][key_start:-key_end]
        new_text_path = os.path.join(folder_path, sub_path, "processed_text.txt")

        # Start from a clean output file
        if os.path.exists(new_text_path):
            os.remove(new_text_path)
        # Sanity check for the case where the input is missing
        if json_info["input"] == "":
            print("It is weird that the input is empty in the LLM process for ", sub_path)
            continue

        # Check the output content
        outputs = json_info["output"]
        if outputs.find("action:") != 0:
            print("It is weird that there is no 'action:' keyword in the output for ", sub_path, " with prompt ", outputs)
            continue
        # Prepare the output file for writing
        contents = outputs.split('\n')
        f = open(new_text_path, "a")

        # Iterate over the "key: value" lines and keep only the value part
        effective_length = 0
        for idx, content in enumerate(contents):
            # Take everything after the first ':' and drop the leading space
            key_word = content.split(":", 1)[1][1:]
            if key_word != "":
                effective_length += 1
            else:
                if idx == 1:
                    print("It is abnormal for this content to be empty ", sub_path, " with prompt ", outputs)
            f.write(key_word + "\n")
        f.close()
        # if effective_length == 2:
        #     print("short prompt case is ", sub_path, " with prompt ", outputs)

        # For those with only one or zero effective lines, we won't consider them
        if effective_length < 2:
            print("The prompt is too short for ", sub_path, " with prompt ", outputs)
            os.remove(new_text_path)

        length_stats[effective_length] += 1

    print("length_stats is ", length_stats)