import os

from typing import List
from typing import Union

import openai
import tiktoken
from dotenv import load_dotenv
from icecream import ic
from langchain_text_splitters import RecursiveCharacterTextSplitter


load_dotenv()

client = openai.OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

MAX_TOKENS_PER_CHUNK = (
    1000  # if the source text is longer than this many tokens, it is split
    # into chunks and translated one chunk at a time
)


def get_completion(
    prompt: str,
    system_message: str = "You are a helpful assistant.",
    model: str = "gpt-4-turbo",
    temperature: float = 0.3,
    json_mode: bool = False,
) -> Union[str, dict]:
    """
    Generate a completion using the OpenAI API.

    Args:
        prompt (str): The user's prompt or query.
        system_message (str, optional): The system message to set the context for the assistant.
            Defaults to "You are a helpful assistant.".
        model (str, optional): The name of the OpenAI model to use for generating the completion.
            Defaults to "gpt-4-turbo".
        temperature (float, optional): The sampling temperature for controlling the randomness of the generated text.
            Defaults to 0.3.
        json_mode (bool, optional): Whether to request the response in JSON format.
            Defaults to False.

    Returns:
        Union[str, dict]: The content of the generated completion.
            If json_mode is True, the model is constrained to emit a JSON object and the
            content is returned as a JSON-formatted string.
            If json_mode is False, the content is returned as plain text.
    """
    if json_mode:
        response = client.chat.completions.create(
            model=model,
            temperature=temperature,
            top_p=1,
            response_format={"type": "json_object"},
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": prompt},
            ],
        )
        return response.choices[0].message.content
    else:
        response = client.chat.completions.create(
            model=model,
            temperature=temperature,
            top_p=1,
            messages=[
                {"role": "system", "content": system_message},
                {"role": "user", "content": prompt},
            ],
        )
        return response.choices[0].message.content
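

# Illustrative use of get_completion. It is left commented out because it calls
# the OpenAI API; the prompt and system message below are examples only, and a
# valid OPENAI_API_KEY must be set in the environment.
#
#     reply = get_completion(
#         "Give one sentence describing reflective translation workflows.",
#         system_message="You are a concise assistant.",
#         temperature=0.0,
#     )
#     ic(reply)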


def one_chunk_initial_translation(
    source_lang: str, target_lang: str, source_text: str
) -> str:
    """
    Translate the entire text as one chunk using an LLM.

    Args:
        source_lang (str): The source language of the text.
        target_lang (str): The target language for translation.
        source_text (str): The text to be translated.

    Returns:
        str: The translated text.
    """

    system_message = f"You are an expert linguist, specializing in translation from {source_lang} to {target_lang}."

    translation_prompt = f"""This is an {source_lang} to {target_lang} translation, please provide the {target_lang} translation for this text. \
Do not provide any explanations or text apart from the translation.
{source_lang}: {source_text}

{target_lang}:"""

    # translation_prompt is an f-string, so its placeholders are already filled in;
    # a further .format() call is unnecessary and would fail if the source text
    # contained braces.
    translation = get_completion(translation_prompt, system_message=system_message)

    return translation


def one_chunk_reflect_on_translation(
    source_lang: str,
    target_lang: str,
    source_text: str,
    translation_1: str,
    country: str = "",
) -> str:
    """
    Use an LLM to reflect on the translation, treating the entire text as one chunk.

    Args:
        source_lang (str): The source language of the text.
        target_lang (str): The target language of the translation.
        source_text (str): The original text in the source language.
        translation_1 (str): The initial translation of the source text.
        country (str): Country specified for the target language.

    Returns:
        str: The LLM's reflection on the translation, providing constructive criticism and suggestions for improvement.
    """

    system_message = f"You are an expert linguist specializing in translation from {source_lang} to {target_lang}. \
You will be provided with a source text and its translation and your goal is to improve the translation."

    if country != "":
        reflection_prompt = f"""Your task is to carefully read a source text and a translation from {source_lang} to {target_lang}, and then give constructive criticism and helpful suggestions to improve the translation. \
The final style and tone of the translation should match the style of {target_lang} colloquially spoken in {country}.

The source text and initial translation, delimited by XML tags <SOURCE_TEXT></SOURCE_TEXT> and <TRANSLATION></TRANSLATION>, are as follows:

<SOURCE_TEXT>
{source_text}
</SOURCE_TEXT>

<TRANSLATION>
{translation_1}
</TRANSLATION>

When writing suggestions, pay attention to whether there are ways to improve the translation's \n\
(i) accuracy (by correcting errors of addition, mistranslation, omission, or untranslated text),\n\
(ii) fluency (by applying {target_lang} grammar, spelling and punctuation rules, and ensuring there are no unnecessary repetitions),\n\
(iii) style (by ensuring the translation reflects the style of the source text and takes into account any cultural context),\n\
(iv) terminology (by ensuring terminology use is consistent and reflects the source text domain; and by ensuring you use equivalent idioms in {target_lang}).\n\

Write a list of specific, helpful and constructive suggestions for improving the translation.
Each suggestion should address one specific part of the translation.
Output only the suggestions and nothing else."""

    else:
        reflection_prompt = f"""Your task is to carefully read a source text and a translation from {source_lang} to {target_lang}, and then give constructive criticism and helpful suggestions to improve the translation.

The source text and initial translation, delimited by XML tags <SOURCE_TEXT></SOURCE_TEXT> and <TRANSLATION></TRANSLATION>, are as follows:

<SOURCE_TEXT>
{source_text}
</SOURCE_TEXT>

<TRANSLATION>
{translation_1}
</TRANSLATION>

When writing suggestions, pay attention to whether there are ways to improve the translation's \n\
(i) accuracy (by correcting errors of addition, mistranslation, omission, or untranslated text),\n\
(ii) fluency (by applying {target_lang} grammar, spelling and punctuation rules, and ensuring there are no unnecessary repetitions),\n\
(iii) style (by ensuring the translation reflects the style of the source text and takes into account any cultural context),\n\
(iv) terminology (by ensuring terminology use is consistent and reflects the source text domain; and by ensuring you use equivalent idioms in {target_lang}).\n\

Write a list of specific, helpful and constructive suggestions for improving the translation.
Each suggestion should address one specific part of the translation.
Output only the suggestions and nothing else."""

    # reflection_prompt is an f-string, so its placeholders are already filled in;
    # no additional .format() call is needed.
    reflection = get_completion(reflection_prompt, system_message=system_message)
    return reflection


def one_chunk_improve_translation(
    source_lang: str,
    target_lang: str,
    source_text: str,
    translation_1: str,
    reflection: str,
) -> str:
    """
    Use the reflection to improve the translation, treating the entire text as one chunk.

    Args:
        source_lang (str): The source language of the text.
        target_lang (str): The target language for the translation.
        source_text (str): The original text in the source language.
        translation_1 (str): The initial translation of the source text.
        reflection (str): Expert suggestions and constructive criticism for improving the translation.

    Returns:
        str: The improved translation based on the expert suggestions.
    """

    system_message = f"You are an expert linguist, specializing in translation editing from {source_lang} to {target_lang}."

    prompt = f"""Your task is to carefully read, then edit, a translation from {source_lang} to {target_lang}, taking into
account a list of expert suggestions and constructive criticisms.

The source text, the initial translation, and the expert linguist suggestions are delimited by XML tags <SOURCE_TEXT></SOURCE_TEXT>, <TRANSLATION></TRANSLATION> and <EXPERT_SUGGESTIONS></EXPERT_SUGGESTIONS> \
as follows:

<SOURCE_TEXT>
{source_text}
</SOURCE_TEXT>

<TRANSLATION>
{translation_1}
</TRANSLATION>

<EXPERT_SUGGESTIONS>
{reflection}
</EXPERT_SUGGESTIONS>

Please take into account the expert suggestions when editing the translation. Edit the translation by ensuring:

(i) accuracy (by correcting errors of addition, mistranslation, omission, or untranslated text),
(ii) fluency (by applying {target_lang} grammar, spelling and punctuation rules and ensuring there are no unnecessary repetitions), \
(iii) style (by ensuring the translation reflects the style of the source text),
(iv) terminology (by correcting terminology that is inappropriate for the context or used inconsistently), and
(v) the absence of other errors.

Output only the new translation and nothing else."""

    translation_2 = get_completion(prompt, system_message=system_message)

    return translation_2


def one_chunk_translate_text(
    source_lang: str, target_lang: str, source_text: str, country: str = ""
) -> str:
    """
    Translate a single chunk of text from the source language to the target language.

    This function performs a two-step translation process:
    1. Get an initial translation of the source text.
    2. Reflect on the initial translation and generate an improved translation.

    Args:
        source_lang (str): The source language of the text.
        target_lang (str): The target language for the translation.
        source_text (str): The text to be translated.
        country (str): Country specified for the target language.

    Returns:
        str: The improved translation of the source text.
    """
    translation_1 = one_chunk_initial_translation(
        source_lang, target_lang, source_text
    )

    reflection = one_chunk_reflect_on_translation(
        source_lang, target_lang, source_text, translation_1, country
    )
    translation_2 = one_chunk_improve_translation(
        source_lang, target_lang, source_text, translation_1, reflection
    )

    return translation_2


def num_tokens_in_string(
    input_str: str, encoding_name: str = "cl100k_base"
) -> int:
    """
    Calculate the number of tokens in a given string using a specified encoding.

    Args:
        input_str (str): The input string to be tokenized.
        encoding_name (str, optional): The name of the encoding to use. Defaults to "cl100k_base",
            which is the most commonly used encoder (used by GPT-4).

    Returns:
        int: The number of tokens in the input string.

    Example:
        >>> text = "Hello, how are you?"
        >>> num_tokens = num_tokens_in_string(text)
        >>> print(num_tokens)
        5
    """
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(input_str))
    return num_tokens


def multichunk_initial_translation(
    source_lang: str, target_lang: str, source_text_chunks: List[str]
) -> List[str]:
    """
    Translate a text in multiple chunks from the source language to the target language.

    Args:
        source_lang (str): The source language of the text.
        target_lang (str): The target language for translation.
        source_text_chunks (List[str]): A list of text chunks to be translated.

    Returns:
        List[str]: A list of translated text chunks.
    """

    system_message = f"You are an expert linguist, specializing in translation from {source_lang} to {target_lang}."

    translation_prompt = """Your task is to provide a professional translation from {source_lang} to {target_lang} of PART of a text.

The source text is below, delimited by XML tags <SOURCE_TEXT> and </SOURCE_TEXT>. Translate only the part within the source text
delimited by <TRANSLATE_THIS> and </TRANSLATE_THIS>. You can use the rest of the source text as context, but do not translate any
of the other text. Do not output anything other than the translation of the indicated part of the text.

<SOURCE_TEXT>
{tagged_text}
</SOURCE_TEXT>

To reiterate, you should translate only this part of the text, shown here again between <TRANSLATE_THIS> and </TRANSLATE_THIS>:
<TRANSLATE_THIS>
{chunk_to_translate}
</TRANSLATE_THIS>

Output only the translation of the portion you are asked to translate, and nothing else.
"""

    translation_chunks = []
    for i in range(len(source_text_chunks)):
        # Surround the current chunk with <TRANSLATE_THIS> tags and keep the rest
        # of the text around it as untranslated context.
        tagged_text = (
            "".join(source_text_chunks[0:i])
            + "<TRANSLATE_THIS>"
            + source_text_chunks[i]
            + "</TRANSLATE_THIS>"
            + "".join(source_text_chunks[i + 1 :])
        )

        prompt = translation_prompt.format(
            source_lang=source_lang,
            target_lang=target_lang,
            tagged_text=tagged_text,
            chunk_to_translate=source_text_chunks[i],
        )

        translation = get_completion(prompt, system_message=system_message)
        translation_chunks.append(translation)

    return translation_chunks


def multichunk_reflect_on_translation(
    source_lang: str,
    target_lang: str,
    source_text_chunks: List[str],
    translation_1_chunks: List[str],
    country: str = "",
) -> List[str]:
    """
    Provides constructive criticism and suggestions for improving a partial translation.

    Args:
        source_lang (str): The source language of the text.
        target_lang (str): The target language of the translation.
        source_text_chunks (List[str]): The source text divided into chunks.
        translation_1_chunks (List[str]): The translated chunks corresponding to the source text chunks.
        country (str): Country specified for the target language.

    Returns:
        List[str]: A list of reflections containing suggestions for improving each translated chunk.
    """

    system_message = f"You are an expert linguist specializing in translation from {source_lang} to {target_lang}. \
You will be provided with a source text and its translation and your goal is to improve the translation."

    if country != "":
        reflection_prompt = """Your task is to carefully read a source text and part of a translation of that text from {source_lang} to {target_lang}, and then give constructive criticism and helpful suggestions for improving the translation.
The final style and tone of the translation should match the style of {target_lang} colloquially spoken in {country}.

The source text is below, delimited by XML tags <SOURCE_TEXT> and </SOURCE_TEXT>, and the part that has been translated
is delimited by <TRANSLATE_THIS> and </TRANSLATE_THIS> within the source text. You can use the rest of the source text
as context for critiquing the translated part.

<SOURCE_TEXT>
{tagged_text}
</SOURCE_TEXT>

To reiterate, only part of the text is being translated, shown here again between <TRANSLATE_THIS> and </TRANSLATE_THIS>:
<TRANSLATE_THIS>
{chunk_to_translate}
</TRANSLATE_THIS>

The translation of the indicated part, delimited below by <TRANSLATION> and </TRANSLATION>, is as follows:
<TRANSLATION>
{translation_1_chunk}
</TRANSLATION>

When writing suggestions, pay attention to whether there are ways to improve the translation's:\n\
(i) accuracy (by correcting errors of addition, mistranslation, omission, or untranslated text),\n\
(ii) fluency (by applying {target_lang} grammar, spelling and punctuation rules, and ensuring there are no unnecessary repetitions),\n\
(iii) style (by ensuring the translation reflects the style of the source text and takes into account any cultural context),\n\
(iv) terminology (by ensuring terminology use is consistent and reflects the source text domain; and by ensuring you use equivalent idioms in {target_lang}).\n\

Write a list of specific, helpful and constructive suggestions for improving the translation.
Each suggestion should address one specific part of the translation.
Output only the suggestions and nothing else."""

    else:
        reflection_prompt = """Your task is to carefully read a source text and part of a translation of that text from {source_lang} to {target_lang}, and then give constructive criticism and helpful suggestions for improving the translation.

The source text is below, delimited by XML tags <SOURCE_TEXT> and </SOURCE_TEXT>, and the part that has been translated
is delimited by <TRANSLATE_THIS> and </TRANSLATE_THIS> within the source text. You can use the rest of the source text
as context for critiquing the translated part.

<SOURCE_TEXT>
{tagged_text}
</SOURCE_TEXT>

To reiterate, only part of the text is being translated, shown here again between <TRANSLATE_THIS> and </TRANSLATE_THIS>:
<TRANSLATE_THIS>
{chunk_to_translate}
</TRANSLATE_THIS>

The translation of the indicated part, delimited below by <TRANSLATION> and </TRANSLATION>, is as follows:
<TRANSLATION>
{translation_1_chunk}
</TRANSLATION>

When writing suggestions, pay attention to whether there are ways to improve the translation's:\n\
(i) accuracy (by correcting errors of addition, mistranslation, omission, or untranslated text),\n\
(ii) fluency (by applying {target_lang} grammar, spelling and punctuation rules, and ensuring there are no unnecessary repetitions),\n\
(iii) style (by ensuring the translation reflects the style of the source text and takes into account any cultural context),\n\
(iv) terminology (by ensuring terminology use is consistent and reflects the source text domain; and by ensuring you use equivalent idioms in {target_lang}).\n\

Write a list of specific, helpful and constructive suggestions for improving the translation.
Each suggestion should address one specific part of the translation.
Output only the suggestions and nothing else."""

    reflection_chunks = []
    for i in range(len(source_text_chunks)):
        # Surround the current chunk with <TRANSLATE_THIS> tags and keep the rest
        # of the text around it as context.
        tagged_text = (
            "".join(source_text_chunks[0:i])
            + "<TRANSLATE_THIS>"
            + source_text_chunks[i]
            + "</TRANSLATE_THIS>"
            + "".join(source_text_chunks[i + 1 :])
        )
        if country != "":
            prompt = reflection_prompt.format(
                source_lang=source_lang,
                target_lang=target_lang,
                tagged_text=tagged_text,
                chunk_to_translate=source_text_chunks[i],
                translation_1_chunk=translation_1_chunks[i],
                country=country,
            )
        else:
            prompt = reflection_prompt.format(
                source_lang=source_lang,
                target_lang=target_lang,
                tagged_text=tagged_text,
                chunk_to_translate=source_text_chunks[i],
                translation_1_chunk=translation_1_chunks[i],
            )

        reflection = get_completion(prompt, system_message=system_message)
        reflection_chunks.append(reflection)

    return reflection_chunks


def multichunk_improve_translation(
    source_lang: str,
    target_lang: str,
    source_text_chunks: List[str],
    translation_1_chunks: List[str],
    reflection_chunks: List[str],
) -> List[str]:
    """
    Improves the translation of a text from the source language to the target language by considering expert suggestions.

    Args:
        source_lang (str): The source language of the text.
        target_lang (str): The target language for translation.
        source_text_chunks (List[str]): The source text divided into chunks.
        translation_1_chunks (List[str]): The initial translation of each chunk.
        reflection_chunks (List[str]): Expert suggestions for improving each translated chunk.

    Returns:
        List[str]: The improved translation of each chunk.
    """

    system_message = f"You are an expert linguist, specializing in translation editing from {source_lang} to {target_lang}."

    improvement_prompt = """Your task is to carefully read, then improve, a translation from {source_lang} to {target_lang}, taking into
account a set of expert suggestions and constructive criticisms. Below, the source text, initial translation, and expert suggestions are provided.

The source text is below, delimited by XML tags <SOURCE_TEXT> and </SOURCE_TEXT>, and the part that has been translated
is delimited by <TRANSLATE_THIS> and </TRANSLATE_THIS> within the source text. You can use the rest of the source text
as context, but need to provide a translation only of the part indicated by <TRANSLATE_THIS> and </TRANSLATE_THIS>.

<SOURCE_TEXT>
{tagged_text}
</SOURCE_TEXT>

To reiterate, only part of the text is being translated, shown here again between <TRANSLATE_THIS> and </TRANSLATE_THIS>:
<TRANSLATE_THIS>
{chunk_to_translate}
</TRANSLATE_THIS>

The translation of the indicated part, delimited below by <TRANSLATION> and </TRANSLATION>, is as follows:
<TRANSLATION>
{translation_1_chunk}
</TRANSLATION>

The expert suggestions for the indicated part, delimited below by <EXPERT_SUGGESTIONS> and </EXPERT_SUGGESTIONS>, are as follows:
<EXPERT_SUGGESTIONS>
{reflection_chunk}
</EXPERT_SUGGESTIONS>

Taking into account the expert suggestions, rewrite the translation to improve it, paying attention
to whether there are ways to improve the translation's

(i) accuracy (by correcting errors of addition, mistranslation, omission, or untranslated text),
(ii) fluency (by applying {target_lang} grammar, spelling and punctuation rules and ensuring there are no unnecessary repetitions), \
(iii) style (by ensuring the translation reflects the style of the source text),
(iv) terminology (by correcting terminology that is inappropriate for the context or used inconsistently), and
(v) other errors.

Output only the new translation of the indicated part and nothing else."""

    translation_2_chunks = []
    for i in range(len(source_text_chunks)):
        # Surround the current chunk with <TRANSLATE_THIS> tags and keep the rest
        # of the text around it as context.
        tagged_text = (
            "".join(source_text_chunks[0:i])
            + "<TRANSLATE_THIS>"
            + source_text_chunks[i]
            + "</TRANSLATE_THIS>"
            + "".join(source_text_chunks[i + 1 :])
        )

        prompt = improvement_prompt.format(
            source_lang=source_lang,
            target_lang=target_lang,
            tagged_text=tagged_text,
            chunk_to_translate=source_text_chunks[i],
            translation_1_chunk=translation_1_chunks[i],
            reflection_chunk=reflection_chunks[i],
        )

        translation_2 = get_completion(prompt, system_message=system_message)
        translation_2_chunks.append(translation_2)

    return translation_2_chunks


def multichunk_translation(
    source_lang, target_lang, source_text_chunks, country: str = ""
):
    """
    Translates multiple text chunks using the initial-translation, reflection, and improvement steps.

    Args:
        source_lang (str): The source language of the text chunks.
        target_lang (str): The target language for translation.
        source_text_chunks (List[str]): The list of source text chunks to be translated.
        country (str): Country specified for the target language.

    Returns:
        List[str]: The list of improved translations for each source text chunk.
    """

    translation_1_chunks = multichunk_initial_translation(
        source_lang, target_lang, source_text_chunks
    )

    reflection_chunks = multichunk_reflect_on_translation(
        source_lang,
        target_lang,
        source_text_chunks,
        translation_1_chunks,
        country,
    )

    translation_2_chunks = multichunk_improve_translation(
        source_lang,
        target_lang,
        source_text_chunks,
        translation_1_chunks,
        reflection_chunks,
    )

    return translation_2_chunks


def calculate_chunk_size(token_count: int, token_limit: int) -> int:
    """
    Calculate the chunk size based on the token count and token limit.

    Args:
        token_count (int): The total number of tokens.
        token_limit (int): The maximum number of tokens allowed per chunk.

    Returns:
        int: The calculated chunk size.

    Description:
        This function calculates the chunk size based on the given token count and token limit.
        If the token count is less than or equal to the token limit, the function returns the token count as the chunk size.
        Otherwise, it calculates the number of chunks needed to accommodate all the tokens within the token limit.
        The chunk size is determined by dividing the token count by the number of chunks.
        If there are tokens remaining after dividing the token count by the token limit,
        the chunk size is adjusted by adding those remaining tokens divided by the number of chunks.

    Example:
        >>> calculate_chunk_size(1000, 500)
        500
        >>> calculate_chunk_size(1530, 500)
        389
        >>> calculate_chunk_size(2242, 500)
        496
    """

    if token_count <= token_limit:
        return token_count

    num_chunks = (token_count + token_limit - 1) // token_limit
    chunk_size = token_count // num_chunks

    remaining_tokens = token_count % token_limit
    if remaining_tokens > 0:
        chunk_size += remaining_tokens // num_chunks

    return chunk_size
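

# Worked example of the arithmetic above, for calculate_chunk_size(1530, 500):
#   num_chunks       = (1530 + 499) // 500  = 4
#   chunk_size       = 1530 // 4            = 382
#   remaining_tokens = 1530 % 500           = 30
#   chunk_size      += 30 // 4              = 7   -> 389 tokens per chunk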


def translate(
    source_lang,
    target_lang,
    source_text,
    country,
    max_tokens=MAX_TOKENS_PER_CHUNK,
):
    """Translate the source_text from source_lang to target_lang."""

    num_tokens_in_text = num_tokens_in_string(source_text)

    ic(num_tokens_in_text)

    if num_tokens_in_text < max_tokens:
        ic("Translating text as a single chunk")

        final_translation = one_chunk_translate_text(
            source_lang, target_lang, source_text, country
        )

        return final_translation

    else:
        ic("Translating text as multiple chunks")

        # Aim for roughly equal-sized chunks that each stay within the token limit.
        token_size = calculate_chunk_size(
            token_count=num_tokens_in_text, token_limit=max_tokens
        )

        ic(token_size)

        text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
            model_name="gpt-4",
            chunk_size=token_size,
            chunk_overlap=0,
        )

        source_text_chunks = text_splitter.split_text(source_text)

        translation_2_chunks = multichunk_translation(
            source_lang, target_lang, source_text_chunks, country
        )

        return "".join(translation_2_chunks)
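

# Minimal usage sketch of the end-to-end pipeline. The language pair, country,
# and sample text below are illustrative only; running this requires a valid
# OPENAI_API_KEY and network access, so it is guarded behind __main__.
if __name__ == "__main__":
    sample_text = "Last week's meeting was moved to Thursday at 3 pm."
    result = translate(
        source_lang="English",
        target_lang="Spanish",
        source_text=sample_text,
        country="Mexico",
    )
    ic(result)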