Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
Fix: Add robust lang detection
Browse files
app.py
CHANGED
|
@@ -100,15 +100,42 @@ def _predict_np2_compat(self, text, k=1, threshold=0.0, on_unicode_error='strict
|
|
| 100 |
lid_model.predict = types.MethodType(_predict_np2_compat, lid_model)
|
| 101 |
|
| 102 |
### Check if lang is english #####################################################
|
| 103 |
-
def is_eng(
|
| 104 |
-
|
|
|
|
| 105 |
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
-
for prob, label in lang_preds:
|
| 109 |
-
if label == "__label__en":
|
| 110 |
-
return True, float(prob)
|
| 111 |
-
|
| 112 |
return False, 0.0
|
| 113 |
|
| 114 |
### Do actual prediction #########################################################
|
|
|
|
| 100 |
lid_model.predict = types.MethodType(_predict_np2_compat, lid_model)
|
| 101 |
|
| 102 |
### Check if lang is english #####################################################
|
| 103 |
+
def _split_lid_output(out):
    """Normalize a language-ID prediction result into ``(labels, probs)``.

    Recognized shapes of *out*:

    * pairs:    ``[(prob, "__label__xx"), ...]`` (list or tuple of pairs)
    * labels:   ``["__label__xx", ...]`` without probabilities; a dummy
      probability of 1.0 is substituted for every label
    * parallel: the 2-tuple ``(labels, probs)``

    Returns ``None`` when *out* matches none of these shapes.
    """
    if isinstance(out, (list, tuple)):
        # The pair and label-only shapes are tested BEFORE the parallel
        # form: a tuple holding exactly two pairs (or two labels) also has
        # length 2 and would otherwise be mistaken for (labels, probs).
        # Requiring item[0] not to be a string keeps a parallel result with
        # exactly two labels, e.g. (("__label__en", "__label__de"), probs),
        # from being read as pairs.
        if out and all(
            isinstance(item, (list, tuple))
            and len(item) == 2
            and isinstance(item[1], str)
            and not isinstance(item[0], str)
            for item in out
        ):
            return [item[1] for item in out], [item[0] for item in out]

        # Also covers the empty list/tuple, which yields no labels at all.
        if all(isinstance(item, str) for item in out):
            return list(out), [1.0] * len(out)

    if isinstance(out, tuple) and len(out) == 2:
        labels, probs = out
        both_iterable = hasattr(labels, "__iter__") and hasattr(probs, "__iter__")
        if both_iterable and not isinstance(labels, str) and not isinstance(probs, str):
            return labels, probs

    return None


def is_eng(text: str, k: int = 3, threshold: float = 0.1):
    """Check whether ``__label__en`` is among the top-*k* predictions for *text*.

    Args:
        text: Text passed to ``lid_model.predict``.
        k: Number of predictions requested from the model.
        threshold: Minimum probability of ``__label__en`` for the text to
            count as English.

    Returns:
        A ``(is_english, probability)`` tuple. ``probability`` is the value
        reported for ``__label__en``, or ``0.0`` when that label is absent
        or the prediction format is not recognized.
    """
    out = lid_model.predict(text, k=k)

    normalized = _split_lid_output(out)
    if normalized is None:
        # Unknown format: the text is reported as English with probability
        # 0.0 (existing behavior, kept unchanged).
        # NOTE(review): presumably a deliberate fail-open - confirm.
        return True, 0.0

    labels, probs = normalized
    # zip() pairs each label with its probability without relying on
    # ``.index`` (absent on array-like label containers) and stops cleanly
    # if the two sequences differ in length.
    for label, prob in zip(labels, probs):
        if label == "__label__en":
            p = float(prob)
            return p >= threshold, p

    return False, 0.0
|
| 140 |
|
| 141 |
### Do actual prediction #########################################################
|