Update processing_qwen2_ts.py

processing_qwen2_ts.py  (CHANGED, +37 −4)
@@ -33,6 +33,7 @@ def sp_encoding(timeseries: np.ndarray, eots_token: bool = True) -> Tuple[np.ndarray, str, dict]:
         prompt (str): The placeholder string with offset and scaling info.
         metadata (dict): Metadata containing the offset and scaling factor.
     """
+    timeseries = np.array(timeseries)
     mean = np.mean(timeseries)
     scaled_timeseries = timeseries - mean
     scale_factor = 1.0
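Note on this hunk: coercing the input with np.array() lets sp_encoding accept a plain Python list or tuple, not just an ndarray; previously the very next statement, `timeseries - mean`, would raise a TypeError for a non-array input. A minimal sketch of the effect (the three return values follow the docstring above):

    import numpy as np

    # A plain list now works; before this change the caller had to pass an ndarray.
    encoded_ts, prompt, metadata = sp_encoding([1.0, 2.0, 3.0, 4.0])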
@@ -57,12 +58,12 @@ class Qwen2TSProcessor(ProcessorMixin):
     feature_extractor_class = None  # You can add a feature extractor if needed
     tokenizer_class = "AutoTokenizer"
 
-    def __init__(self, tokenizer=None):
+    def __init__(self, tokenizer=None, **kwargs):
         """
         Args:
             tokenizer: An optional tokenizer to process text prompts.
         """
-        super().__init__(tokenizer=tokenizer)
+        super().__init__(tokenizer=tokenizer, **kwargs)
 
     def __call__(
         self,
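Note on this hunk: forwarding **kwargs to ProcessorMixin means extra construction arguments (for example, ones persisted in the processor config and passed back in by from_pretrained) no longer raise a TypeError. A short sketch, assuming a hypothetical extra kwarg:

    # `chat_template` stands in for any extra kwarg ProcessorMixin understands;
    # it is illustrative here and does not appear in this diff.
    processor = Qwen2TSProcessor(tokenizer=tokenizer, chat_template=None)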
@@ -71,6 +72,7 @@ class Qwen2TSProcessor(ProcessorMixin):
         padding: Union[bool, str, PaddingStrategy] = False,
         padding_side: str = 'left',
         vllm_flag: bool = False,
+        tokenize: bool = True,
         **kwargs,
     ) -> BatchFeature:
         """
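The new `tokenize` flag added here is consumed further down in __call__, in the next hunk.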
@@ -150,8 +152,10 @@ class Qwen2TSProcessor(ProcessorMixin):
 
         # Tokenize the processed prompt
         tokenizer_outputs = {}
-        if self.tokenizer is not None:
+        if tokenize and self.tokenizer is not None:
             tokenizer_outputs = self.tokenizer(reconstructed_prompts, padding=padding, padding_side=padding_side, **kwargs)
+        else:
+            tokenizer_outputs = {"text": reconstructed_prompts}
 
         # Create the final output
         outputs = tokenizer_outputs
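Note on these two hunks: with tokenize=False, __call__ skips the tokenizer and returns the reconstructed prompt strings under a "text" key instead of token IDs, which is handy when a downstream engine does its own tokenization (consistent with the existing vllm_flag path). A sketch, assuming the prompt/series arguments that precede `padding` in the signature (not shown in this diff) are named `text` and `timeseries`:

    out = processor(text=[prompt], timeseries=[ts], tokenize=False)
    out["text"]  # prompt strings with time-series placeholders filled in, untokenized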
@@ -162,6 +166,35 @@ class Qwen2TSProcessor(ProcessorMixin):
 
         return BatchFeature(data=outputs)
 
+    def encode_timeseries(
+        self,
+        timeseries: Optional[List[List[np.ndarray]]] = None,
+    ) -> np.ndarray:
+        if timeseries is None:
+            timeseries = []
+
+        concatenated_ts = None
+        encoded_ts_arrays = []
+
+        for i, ts in enumerate(timeseries):
+            encoded_ts, _, _ = sp_encoding(ts)
+            # Ensure time series shape [1, seq_len, feature_dim] for batch concatenation
+            encoded_ts_arrays.append(encoded_ts[None, ...])
+
+        if len(encoded_ts_arrays) > 0:
+            # Pad time series to the same length
+            max_length = max(ts.shape[1] for ts in encoded_ts_arrays)
+            padded_ts_arrays = [
+                np.pad(ts, ((0, 0), (0, max_length - ts.shape[1]), (0, 0)), mode="constant", constant_values=0.0)
+                for ts in encoded_ts_arrays
+            ]
+            concatenated_ts = np.concatenate(padded_ts_arrays, axis=0)  # Shape: [batch_size, max_length, feature_dim]
+
+            # Convert to torch
+            concatenated_ts = torch.from_numpy(concatenated_ts).half()
+
+        return concatenated_ts
+
     @property
     def model_input_names(self):
         """
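Note on this hunk: the new encode_timeseries helper runs each series through sp_encoding, adds a leading batch axis, zero-pads every encoded series to the longest one, stacks them, and converts the result to float16 with torch. Two things worth knowing when calling it: the annotation says -> np.ndarray, but the populated branch actually returns a torch.Tensor, and an empty input returns None. A usage sketch with two variable-length series (shapes are illustrative):

    import numpy as np

    ts_batch = [np.random.randn(256), np.random.randn(100)]
    ts_tensor = processor.encode_timeseries(ts_batch)
    # ts_tensor: torch.float16 tensor of shape [2, padded_len, feature_dim]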
@@ -184,4 +217,4 @@ class Qwen2TSProcessor(ProcessorMixin):
         This method forwards all its arguments to Qwen2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
         the docstring of this method for more information.
         """
-        return self.tokenizer.decode(*args, **kwargs)
+        return self.tokenizer.decode(*args, **kwargs)

(The removed and re-added return line is byte-identical; a -/+ pair like this typically reflects a newline-at-end-of-file change rather than a code change.)
|