Spaces:
				
			
			
	
			
			
		Sleeping
		
	
	
	
			
			
	
	
	
	
		
		
		Sleeping
		
	The translation model is now compatible with the "Word Timestamps - Highlight Words" feature.
Browse files
- app.py +32 -0
- docs/translateModel.md +14 -1
- src/utils.py +25 -9
    	
        app.py
    CHANGED
    
    | @@ -716,6 +716,38 @@ class WhisperTranscriber: | |
| 716 | 
             
                                segments_progress_listener.on_progress(idx+1, len(segments), desc=f"Process segments: {idx}/{len(segments)}")
         | 
| 717 |  | 
| 718 | 
             
                            translationModel.release_vram()
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 719 | 
             
                            perf_end_time = time.perf_counter()
         | 
| 720 | 
             
                            # Call the finished callback
         | 
| 721 | 
             
                            if segments_progress_listener is not None:
         | 
|  | |
| 716 | 
             
                                segments_progress_listener.on_progress(idx+1, len(segments), desc=f"Process segments: {idx}/{len(segments)}")
         | 
| 717 |  | 
| 718 | 
             
                            translationModel.release_vram()
         | 
| 719 | 
            +
             | 
| 720 | 
            +
                            if highlight_words and segments[0]["words"] is not None:
         | 
| 721 | 
            +
                                for idx, segment in enumerate(segments):
         | 
| 722 | 
            +
                                    text = segment["text"]
         | 
| 723 | 
            +
                                    words = segment["words"]
         | 
| 724 | 
            +
                                    total_duration = words[-1]['end'] - words[0]['start'] #Calculate the total duration of the entire sentence
         | 
| 725 | 
            +
                                    total_text_length = len(text)
         | 
| 726 | 
            +
             | 
| 727 | 
            +
                                    # Allocate lengths to each word
         | 
| 728 | 
            +
                                    duration_ratio_lengths = []
         | 
| 729 | 
            +
                                    total_allocated = 0
         | 
| 730 | 
            +
                                    text_idx = 0  # Track the position in the translated string
         | 
| 731 | 
            +
                                    for word in words:
         | 
| 732 | 
            +
                                        # Calculate the duration of each word as a proportion of the total time
         | 
| 733 | 
            +
                                        word_duration = word['end'] - word['start']
         | 
| 734 | 
            +
                                        duration_ratio = word_duration / total_duration
         | 
| 735 | 
            +
                                        duration_ratio_length = int(duration_ratio * total_text_length)
         | 
| 736 | 
            +
                                        duration_ratio_lengths.append(duration_ratio_length)
         | 
| 737 | 
            +
                                        total_allocated += duration_ratio_length
         | 
| 738 | 
            +
             | 
| 739 | 
            +
                                    # Distribute remaining characters to avoid 0-duration_ratio_length issues
         | 
| 740 | 
            +
                                    remaining_chars = total_text_length - total_allocated
         | 
| 741 | 
            +
                                    for idx in range(remaining_chars):
         | 
| 742 | 
            +
                                        duration_ratio_lengths[idx % len(words)] += 1  # Distribute the remaining chars evenly
         | 
| 743 | 
            +
             | 
| 744 | 
            +
                                    # Generate translated words based on the calculated lengths
         | 
| 745 | 
            +
                                    text_idx = 0
         | 
| 746 | 
            +
                                    for idx, word in enumerate(words):
         | 
| 747 | 
            +
                                        text_part = text[text_idx:text_idx + duration_ratio_lengths[idx]]
         | 
| 748 | 
            +
                                        word["word"], word["word_original"] = text_part, word["word"]
         | 
| 749 | 
            +
                                        text_idx += duration_ratio_lengths[idx]
         | 
| 750 | 
            +
             | 
| 751 | 
             
                            perf_end_time = time.perf_counter()
         | 
| 752 | 
             
                            # Call the finished callback
         | 
| 753 | 
             
                            if segments_progress_listener is not None:
         | 
    	
        docs/translateModel.md
    CHANGED
    
    | @@ -5,7 +5,9 @@ The `translate` task in `Whisper` only supports translating other languages `int | |
| 5 |  | 
| 6 | 
             
            The larger the parameters of the Translation model, the better its translation capability is expected. However, this also requires higher computational resources and slower running speed.  
         | 
| 7 |  | 
| 8 | 
            -
             | 
|  | |
|  | |
| 9 |  | 
| 10 |  | 
| 11 | 
             
            # Translation Model
         | 
| @@ -153,6 +155,17 @@ Automatic speech recognition (ASR) | |
| 153 | 
             
            | [facebook/seamless-m4t-large](https://huggingface.co/facebook/seamless-m4t-large) | 2.3B | 11.4 GB | float32 | N/A |
         | 
| 154 | 
             
            | [facebook/seamless-m4t-v2-large](https://huggingface.co/facebook/seamless-m4t-v2-large) | 2.3B | 11.4 GB (safetensors:9.24 GB) | float32 | ≈9.2 GB |
         | 
| 155 |  | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 156 |  | 
| 157 | 
             
            # Options
         | 
| 158 |  | 
|  | |
| 5 |  | 
| 6 | 
             
            The larger the parameters of the Translation model, the better its translation capability is expected. However, this also requires higher computational resources and slower running speed.  
         | 
| 7 |  | 
| 8 | 
            +
            The translation model is now compatible with the `Word Timestamps - Highlight Words` feature.  
         | 
| 9 | 
            +
             | 
| 10 | 
            +
            ~~Currently, when the `Highlight Words timestamps` option is enabled in the Whisper `Word Timestamps options`, it cannot be used simultaneously with the Translation Model. This is because Highlight Words splits the source text, and after translation, it becomes a non-word-level string.~~  
         | 
| 11 |  | 
| 12 |  | 
| 13 | 
             
            # Translation Model
         | 
|  | |
| 155 | 
             
            | [facebook/seamless-m4t-large](https://huggingface.co/facebook/seamless-m4t-large) | 2.3B | 11.4 GB | float32 | N/A |
         | 
| 156 | 
             
            | [facebook/seamless-m4t-v2-large](https://huggingface.co/facebook/seamless-m4t-v2-large) | 2.3B | 11.4 GB (safetensors:9.24 GB) | float32 | ≈9.2 GB |
         | 
| 157 |  | 
| 158 | 
            +
            ## Llama
         | 
| 159 | 
            +
             | 
| 160 | 
            +
            Meta developed and released the Meta Llama 3 family of large language models (LLMs). This program uses prompts to make these models function as translation models.
         | 
| 161 | 
            +
             | 
| 162 | 
            +
            | Name | Parameters | Size | type/quantize | Required VRAM |
         | 
| 163 | 
            +
            |------|------------|------|---------------|---------------|
         | 
| 164 | 
            +
            | [avans06/Meta-Llama-3.2-8B-Instruct-ct2-int8_float16](https://huggingface.co/avans06/Meta-Llama-3.2-8B-Instruct-ct2-int8_float16) | 8B | 8.04 GB | int8_float16 | ≈ 7.9 GB |
         | 
| 165 | 
            +
            | [avans06/Meta-Llama-3.1-8B-Instruct-ct2-int8_float16](https://huggingface.co/avans06/Meta-Llama-3.1-8B-Instruct-ct2-int8_float16) | 8B | 8.04 GB | int8_float16 | ≈ 7.9 GB |
         | 
| 166 | 
            +
            | [avans06/Meta-Llama-3-8B-Instruct-ct2-int8_float16](https://huggingface.co/avans06/Meta-Llama-3-8B-Instruct-ct2-int8_float16) | 8B | 8.04 GB | int8_float16 | ≈ 7.9 GB |
         | 
| 167 | 
            +
            | [jncraton/Llama-3.2-3B-Instruct-ct2-int8](https://huggingface.co/jncraton/Llama-3.2-3B-Instruct-ct2-int8) | 3B | 3.22 GB | int8 | ≈ 3.3 GB |
         | 
| 168 | 
            +
             | 
| 169 |  | 
| 170 | 
             
            # Options
         | 
| 171 |  | 
    	
        src/utils.py
    CHANGED
    
    | @@ -155,7 +155,7 @@ def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: i | |
| 155 | 
             
                    subtitle_start = segment['start']
         | 
| 156 | 
             
                    subtitle_end   = segment['end']
         | 
| 157 | 
             
                    text           = segment['text'].strip()
         | 
| 158 | 
            -
                     | 
| 159 |  | 
| 160 | 
             
                    if len(words) == 0:
         | 
| 161 | 
             
                        # Prepend the longest speaker ID if available
         | 
| @@ -167,8 +167,8 @@ def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: i | |
| 167 | 
             
                            'end'  : subtitle_end,
         | 
| 168 | 
             
                            'text' : process_text(text, maxLineWidth)
         | 
| 169 | 
             
                        }
         | 
| 170 | 
            -
                        if  | 
| 171 | 
            -
                            result.update({'original': process_text( | 
| 172 | 
             
                        yield result
         | 
| 173 |  | 
| 174 | 
             
                        # We are done
         | 
| @@ -181,12 +181,14 @@ def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: i | |
| 181 | 
             
                            'end'  : subtitle_start,
         | 
| 182 | 
             
                            'word' : f"({segment_longest_speaker})"
         | 
| 183 | 
             
                        })
         | 
|  | |
|  | |
| 184 |  | 
| 185 | 
            -
                    text_words = [text] if not highlight_words and original_text is not None and len(original_text) > 0 else [ this_word["word"] for this_word in words ]
         | 
| 186 | 
             
                    subtitle_text = __join_words(text_words, maxLineWidth)
         | 
| 187 |  | 
| 188 | 
             
                    # Iterate over the words in the segment
         | 
| 189 | 
             
                    if highlight_words:
         | 
|  | |
| 190 | 
             
                        last = subtitle_start
         | 
| 191 |  | 
| 192 | 
             
                        for idx, this_word in enumerate(words):
         | 
| @@ -195,14 +197,17 @@ def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: i | |
| 195 |  | 
| 196 | 
             
                            if last != start:
         | 
| 197 | 
             
                                # Display the text up to this point
         | 
| 198 | 
            -
                                 | 
| 199 | 
             
                                    'start': last,
         | 
| 200 | 
             
                                    'end'  : start,
         | 
| 201 | 
             
                                    'text' : subtitle_text
         | 
| 202 | 
             
                                }
         | 
|  | |
|  | |
|  | |
| 203 |  | 
| 204 | 
             
                            # Display the text with the current word highlighted
         | 
| 205 | 
            -
                             | 
| 206 | 
             
                                'start': start,
         | 
| 207 | 
             
                                'end'  : end,
         | 
| 208 | 
             
                                'text' : __join_words(
         | 
| @@ -212,15 +217,26 @@ def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: i | |
| 212 | 
             
                                    ]
         | 
| 213 | 
             
                                    , maxLineWidth)
         | 
| 214 | 
             
                            }
         | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 215 | 
             
                            last = end
         | 
| 216 |  | 
| 217 | 
             
                        if last != subtitle_end:
         | 
| 218 | 
             
                            # Display the last part of the text
         | 
| 219 | 
            -
                             | 
| 220 | 
             
                                'start': last,
         | 
| 221 | 
             
                                'end'  : subtitle_end,
         | 
| 222 | 
             
                                'text' : subtitle_text
         | 
| 223 | 
             
                            }
         | 
|  | |
|  | |
|  | |
| 224 |  | 
| 225 | 
             
                    # Just return the subtitle text
         | 
| 226 | 
             
                    else:
         | 
| @@ -229,8 +245,8 @@ def __subtitle_preprocessor_iterator(transcript: Iterator[dict], maxLineWidth: i | |
| 229 | 
             
                            'end'  : subtitle_end,
         | 
| 230 | 
             
                            'text' : subtitle_text
         | 
| 231 | 
             
                        }
         | 
| 232 | 
            -
                        if  | 
| 233 | 
            -
                            result.update({'original': process_text( | 
| 234 | 
             
                        yield result
         | 
| 235 |  | 
| 236 | 
             
            def __join_words(words: Iterator[str], maxLineWidth: int = None):
         | 
|  | |
| 155 | 
             
                    subtitle_start = segment['start']
         | 
| 156 | 
             
                    subtitle_end   = segment['end']
         | 
| 157 | 
             
                    text           = segment['text'].strip()
         | 
| 158 | 
            +
                    text_original  = segment['original'].strip() if 'original' in segment else None
         | 
| 159 |  | 
| 160 | 
             
                    if len(words) == 0:
         | 
| 161 | 
             
                        # Prepend the longest speaker ID if available
         | 
|  | |
| 167 | 
             
                            'end'  : subtitle_end,
         | 
| 168 | 
             
                            'text' : process_text(text, maxLineWidth)
         | 
| 169 | 
             
                        }
         | 
| 170 | 
            +
                        if text_original is not None and len(text_original) > 0:
         | 
| 171 | 
            +
                            result.update({'original': process_text(text_original, maxLineWidth)})
         | 
| 172 | 
             
                        yield result
         | 
| 173 |  | 
| 174 | 
             
                        # We are done
         | 
|  | |
| 181 | 
             
                            'end'  : subtitle_start,
         | 
| 182 | 
             
                            'word' : f"({segment_longest_speaker})"
         | 
| 183 | 
             
                        })
         | 
| 184 | 
            +
                        
         | 
| 185 | 
            +
                    text_words = [text] if not highlight_words and text_original is not None and len(text_original) > 0 else [ this_word["word"] for this_word in words ]
         | 
| 186 |  | 
|  | |
| 187 | 
             
                    subtitle_text = __join_words(text_words, maxLineWidth)
         | 
| 188 |  | 
| 189 | 
             
                    # Iterate over the words in the segment
         | 
| 190 | 
             
                    if highlight_words:
         | 
| 191 | 
            +
                        text_words_original = [ this_word["word_original"] for this_word in words if "word_original" in this_word ] if text_original is not None and len(text_original) > 0 else None
         | 
| 192 | 
             
                        last = subtitle_start
         | 
| 193 |  | 
| 194 | 
             
                        for idx, this_word in enumerate(words):
         | 
|  | |
| 197 |  | 
| 198 | 
             
                            if last != start:
         | 
| 199 | 
             
                                # Display the text up to this point
         | 
| 200 | 
            +
                                result = {
         | 
| 201 | 
             
                                    'start': last,
         | 
| 202 | 
             
                                    'end'  : start,
         | 
| 203 | 
             
                                    'text' : subtitle_text
         | 
| 204 | 
             
                                }
         | 
| 205 | 
            +
                                if text_original is not None and len(text_original) > 0:
         | 
| 206 | 
            +
                                    result.update({'original': process_text(text_original, maxLineWidth)})
         | 
| 207 | 
            +
                                yield result
         | 
| 208 |  | 
| 209 | 
             
                            # Display the text with the current word highlighted
         | 
| 210 | 
            +
                            result = {
         | 
| 211 | 
             
                                'start': start,
         | 
| 212 | 
             
                                'end'  : end,
         | 
| 213 | 
             
                                'text' : __join_words(
         | 
|  | |
| 217 | 
             
                                    ]
         | 
| 218 | 
             
                                    , maxLineWidth)
         | 
| 219 | 
             
                            }
         | 
| 220 | 
            +
                            if text_words_original is not None and len(text_words_original) > 0:
         | 
| 221 | 
            +
                                result.update({'original': __join_words(
         | 
| 222 | 
            +
                                    [
         | 
| 223 | 
            +
                                        re.sub(r"^(\s*)(.*)$", r"\1<u>\2</u>", word_original) if subidx == idx else word_original
         | 
| 224 | 
            +
                                        for subidx, word_original in enumerate(text_words_original)
         | 
| 225 | 
            +
                                    ]
         | 
| 226 | 
            +
                                    , maxLineWidth)})
         | 
| 227 | 
            +
                            yield result
         | 
| 228 | 
             
                            last = end
         | 
| 229 |  | 
| 230 | 
             
                        if last != subtitle_end:
         | 
| 231 | 
             
                            # Display the last part of the text
         | 
| 232 | 
            +
                            result = {
         | 
| 233 | 
             
                                'start': last,
         | 
| 234 | 
             
                                'end'  : subtitle_end,
         | 
| 235 | 
             
                                'text' : subtitle_text
         | 
| 236 | 
             
                            }
         | 
| 237 | 
            +
                            if text_original is not None and len(text_original) > 0:
         | 
| 238 | 
            +
                                result.update({'original': process_text(text_original, maxLineWidth)})
         | 
| 239 | 
            +
                            yield result
         | 
| 240 |  | 
| 241 | 
             
                    # Just return the subtitle text
         | 
| 242 | 
             
                    else:
         | 
|  | |
| 245 | 
             
                            'end'  : subtitle_end,
         | 
| 246 | 
             
                            'text' : subtitle_text
         | 
| 247 | 
             
                        }
         | 
| 248 | 
            +
                        if text_original is not None and len(text_original) > 0:
         | 
| 249 | 
            +
                            result.update({'original': process_text(text_original, maxLineWidth)})
         | 
| 250 | 
             
                        yield result
         | 
| 251 |  | 
| 252 | 
             
            def __join_words(words: Iterator[str], maxLineWidth: int = None):
         | 
