Update README and scripts

- README.md +12 -10
- configuration_qwen2.py +0 -5
- figures/chat_example.png +0 -0
- modeling_qwen2.py +4 -83
README.md
CHANGED

@@ -1,20 +1,22 @@
- ---
- license: apache-2.0
- base_model:
- - Qwen/Qwen2.5-14B-Instruct
- ---
  # ChatTS-14B Model
-
+ `ChatTS` focuses on **Understanding and Reasoning** about time series, much like what vision/video/audio MLLMs do.
+ This repo provides code, datasets, and the model for `ChatTS`: [ChatTS: Aligning Time Series with LLMs via Synthetic Data for Enhanced Understanding and Reasoning](https://arxiv.org/pdf/2412.03104).
  
-
+ Here is an example of a ChatTS application, which allows users to interact with an LLM to understand and reason about time series data:
+ 
+
+ ## Usage
+ This model is fine-tuned on Qwen2.5-14B-Instruct (https://huggingface.co/Qwen/Qwen2.5-14B-Instruct). For more usage details, please refer to the `README.md` in the ChatTS repository.
+
+ ## Reference
  - Qwen2.5-14B-Instruct (https://huggingface.co/Qwen/Qwen2.5-14B-Instruct)
  - transformers (https://github.com/huggingface/transformers.git)
  - [ChatTS Paper](https://arxiv.org/pdf/2412.03104)
  
-
+ ## License
  This model is licensed under the [Apache License 2.0](LICENSE).
  
-
+ ## Cite
  ```
  @article{xie2024chatts,
  title={ChatTS: Aligning Time Series with LLMs via Synthetic Data for Enhanced Understanding and Reasoning},

@@ -22,4 +24,4 @@ This model is licensed under the [Apache License 2.0](LICENSE).
  journal={arXiv preprint arXiv:2412.03104},
  year={2024}
  }
- ```
+ ```

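The Usage section added above defers to the ChatTS repository for details. Purely as a hedged orientation (not taken from that README), loading a checkpoint that ships custom `configuration_qwen2.py` / `modeling_qwen2.py` files normally goes through `trust_remote_code`; the model path below is a placeholder, and the exact prompt format for the `<ts>` placeholders is defined in the ChatTS repository, not here.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "local/path/to/ChatTS-14B"  # placeholder; point this at the downloaded checkpoint

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True,   # so the custom Qwen2TS config/model classes in this repo are used
    torch_dtype=torch.float16,
    device_map="auto",
)
```

The time series themselves are then passed to the model through the `timeseries` argument visible in `modeling_qwen2.py` below; see the ChatTS repository for the supported preprocessing.
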
configuration_qwen2.py
CHANGED

@@ -1,5 +1,4 @@
  # coding=utf-8
- # The following code are reused from the QWen project (https://huggingface.co/Qwen/Qwen2.5-14B-Instruct) of Alibaba Cloud.
  # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
  #
  # Licensed under the Apache License, Version 2.0 (the "License");

@@ -13,10 +12,6 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
-
- # The code is modified by ByteDance and Tsinghua University from the original implementation of Qwen:
- # - We changed Qwen2Config to Qwen2TSConfig to support time series modeling.
-
  """ Qwen2 model configuration"""
  
  from transformers import PretrainedConfig

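The removed comment above notes that `Qwen2Config` was extended into `Qwen2TSConfig` for time-series modeling. For orientation only, these are the time-series fields that `modeling_qwen2.py` (below) reads from the config; the field names come from that code, while the numeric values here are invented examples rather than the values shipped in this repo's `config.json`:

```python
# Illustrative only: config fields consumed by the time-series code paths below.
ts_config_sketch = {
    "ts": {
        "patch_size": 16,     # time steps per patch fed to the embedding MLP
        "num_features": 1,    # feature channels per time step
        "num_layers": 3,      # depth of the embedding MLP
        "hidden_size": 5120,  # must match the LLM hidden size so patch embeddings can be spliced in
    },
    "ts_token_start_index": 151665,  # token id of the <ts> start marker (example value)
    "ts_token_end_index": 151666,    # token id of the closing time-series marker (example value)
}
```
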
figures/chat_example.png
ADDED
modeling_qwen2.py
CHANGED

@@ -1,5 +1,4 @@
  # coding=utf-8
- # The following code are reused from the QWen project (https://huggingface.co/Qwen/Qwen2.5-14B-Instruct) of Alibaba Cloud.
  # Copyright 2024 The Qwen team, Alibaba Group and the HuggingFace Inc. team. All rights reserved.
  #
  # This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX

@@ -18,10 +17,6 @@
  # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  # See the License for the specific language governing permissions and
  # limitations under the License.
-
- # The code is modified by ByteDance and Tsinghua University from the original implementation of Qwen:
- # - Support time series modality for Qwen2 model.
-
  """ PyTorch Qwen2 model."""
  import inspect
  import math

@@ -78,7 +73,6 @@ class TimeSeriesEmbedding(nn.Module):
  self.num_features = config['num_features']
  
  layers = []
- # Adjust the input size to include the mask channel
  input_size = 1 * self.patch_size
  
  for _ in range(self.num_layers - 1):

@@ -97,7 +91,6 @@
  valid_lengths = mask.sum(dim=1).long()  # Shape: (batch_size)
  
  patch_cnt = (valid_lengths + self.patch_size - 1) // self.patch_size  # ceil division
- # print(f"[DEBUG] TimeSeriesEmbedding: {valid_lengths=}, {patch_cnt=}, {mask.shape=}")
  
  patches_list = []
  for i in range(batch_size):

@@ -118,9 +111,7 @@
  x_patches = torch.cat(patches_list, dim=0)  # Shape: (total_patch_cnt, patch_size * num_features)
  x = self.mlp(x_patches)
  else:
- # If there are no valid patches, return an empty tensor
  x = torch.empty(0, self.hidden_size, device=x.device)
- # print(f"[DEBUG] TimeSeriesEmbedding OUTPUT: {x.shape=}, {patch_cnt=}")
  
  return x, patch_cnt
  
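The unchanged context above shows the core of `TimeSeriesEmbedding`: each series is split into `ceil(valid_length / patch_size)` patches, and the flattened patches go through an MLP. A small self-contained illustration of that patch counting (the values, the zero-padding of the last patch, and the shapes are invented for the example; the repo's own patching loop is not shown in this diff):

```python
import torch

patch_size = 4
values = torch.randn(2, 10)                                  # (batch_size, num_time_steps)
mask = torch.tensor([[1.0] * 10, [1.0] * 7 + [0.0] * 3])     # 1 marks a valid time step

valid_lengths = mask.sum(dim=1).long()                       # tensor([10, 7])
patch_cnt = (valid_lengths + patch_size - 1) // patch_size   # ceil -> tensor([3, 2])

patches_list = []
for i in range(values.size(0)):
    n = int(valid_lengths[i])
    padded = torch.zeros(int(patch_cnt[i]) * patch_size)
    padded[:n] = values[i, :n]                               # last patch zero-padded here
    patches_list.append(padded.view(-1, patch_size))         # (patch_cnt[i], patch_size)

x_patches = torch.cat(patches_list, dim=0)
print(patch_cnt.tolist(), tuple(x_patches.shape))            # [3, 2] (5, 4)
```

Everything concatenated this way is embedded in one MLP call, which is why the function also returns `patch_cnt`: the caller needs it to know how many embedding rows belong to each `<ts>` span.
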
@@ -1204,21 +1195,7 @@ class Qwen2TSForCausalLM(Qwen2PreTrainedModel):
  return num_special_ts_tokens * (num_patches - 2) + input_ids.size(1)
  
  def _get_original_length(self, timeseries, input_ids, past_length):
- """
- Compute the original sequence length corresponding to the transformed past_length, and return the number of <ts> tokens it contains.
-
- Args:
- timeseries (Tensor): time-series tensor of shape (batch_size, num_time_steps).
- input_ids (Tensor): original input-ID tensor of shape (batch_size, seq_length).
- past_length (int or Tensor): transformed sequence length (including the inserted time-series feature tokens); either a scalar or a tensor of shape (batch_size,).
-
- Returns:
- Tuple[Tensor, Tensor]:
- - original_length (Tensor): the original sequence length of each sample, shape (batch_size,).
- - num_special_ts_tokens_within_past (Tensor): the number of <ts> tokens within past_length for each sample, shape (batch_size,).
- """
  if timeseries is None:
- # If no time-series features are inserted, the original length equals past_length
  if isinstance(past_length, int):
  original_length = torch.full((input_ids.size(0),), past_length, dtype=torch.long, device=input_ids.device)
  else:

@@ -1226,45 +1203,32 @@
  num_special_ts_tokens_within_past = torch.zeros(input_ids.size(0), dtype=torch.long, device=input_ids.device)
  return original_length, num_special_ts_tokens_within_past
  
- # Read the configuration parameters
  patch_size = self.config.ts['patch_size']
  num_patches = timeseries.size(1) * timeseries.size(2) // patch_size // self.config.ts['num_features']
  ts_token_start_index = self.config.ts_token_start_index
  
- # Build a mask marking the positions of <ts> tokens
  ts_mask = (input_ids == ts_token_start_index).long()  # (batch_size, seq_length)
  
- # Cumulative count of <ts> tokens seen up to each position
  cumsum_ts = torch.cumsum(ts_mask, dim=1)  # (batch_size, seq_length)
  
- # Position indices, starting from 1
  seq_length = input_ids.size(1)
  positions = torch.arange(1, seq_length + 1, device=input_ids.device).unsqueeze(0).expand_as(input_ids)  # (batch_size, seq_length)
  
- # Positions after the transformation
  transformed_length = positions + cumsum_ts * (num_patches - 2)  # (batch_size, seq_length)
  
- # past_length may be a scalar or a tensor
  if isinstance(past_length, int):
  past_length_tensor = torch.full((input_ids.size(0),), past_length, dtype=torch.long, device=input_ids.device)
  else:
  past_length_tensor = past_length.to(input_ids.device)
  
- # Mark the original positions whose transformed position does not exceed past_length
  mask = transformed_length <= past_length_tensor.unsqueeze(1)  # (batch_size, seq_length)
  
- # For each sample, the number of qualifying positions is the original length
  original_length = torch.sum(mask, dim=1)  # (batch_size,)
-
- # Count the <ts> tokens that fall within original_length:
- # build a mask of <ts> tokens inside the original_length range,
- # starting from a position index
  original_positions = torch.arange(1, seq_length + 1, device=input_ids.device).unsqueeze(0).expand_as(input_ids)  # (batch_size, seq_length)
  original_mask = original_positions <= original_length.unsqueeze(1)  # (batch_size, seq_length)
  ts_within_original_mask = ts_mask.bool() & original_mask.bool()  # (batch_size, seq_length)
  num_special_ts_tokens_within_past = torch.sum(ts_within_original_mask, dim=1)  # (batch_size,)
  
- # Make sure original_length is not negative
  original_length = torch.clamp(original_length, min=0)
  
  return original_length, num_special_ts_tokens_within_past
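The removed docstring (translated above) describes the mapping this helper inverts: during merging, every `<ts>` start token expands into `num_patches - 2` extra embedding slots, so the transformed position of a token is its original position plus that expansion times the number of `<ts>` tokens seen so far. A tiny worked example with invented token ids and sizes:

```python
import torch

num_patches = 6
ts_token_start_index = 9                          # pretend id of the <ts> start token
input_ids = torch.tensor([[5, 9, 7, 7, 9, 7]])    # two <ts> spans in a 6-token prompt

ts_mask = (input_ids == ts_token_start_index).long()
cumsum_ts = torch.cumsum(ts_mask, dim=1)
positions = torch.arange(1, input_ids.size(1) + 1).unsqueeze(0)
transformed_length = positions + cumsum_ts * (num_patches - 2)
print(transformed_length.tolist())                # [[1, 6, 7, 8, 13, 14]]

# Inverting the mapping: how many original tokens fit inside a transformed
# past_length of 8?  Exactly the positions whose transformed position is <= 8.
original_length = (transformed_length <= 8).sum(dim=1)
print(original_length.item())                     # 4 (tokens 1-4, i.e. one <ts> token included)
```
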
@@ -1280,7 +1244,6 @@ class Qwen2TSForCausalLM(Qwen2PreTrainedModel):
  special_ts_token_mask_start = input_ids == self.config.ts_token_start_index
  special_ts_token_mask_end = input_ids == self.config.ts_token_end_index
  special_ts_token_mask = special_ts_token_mask_start | special_ts_token_mask_end
- # print("Special ts token mask:", special_ts_token_mask)
  num_special_ts_tokens = torch.sum(special_ts_token_mask_start, dim=-1)
  # Correctly calculate the total number of patches per batch
  num_total_patches = torch.zeros(batch_size, dtype=patch_cnt.dtype, device=patch_cnt.device)

@@ -1291,8 +1254,8 @@ class Qwen2TSForCausalLM(Qwen2PreTrainedModel):
  num_ts_in_batch = num_special_ts_tokens[i]
  num_total_patches[i] = patch_cnt[patch_index:patch_index + num_ts_in_batch].sum() - 2 * num_ts_in_batch
  for idx in range(patch_index, patch_index + num_ts_in_batch):
- batch_idx,
- special_ts_token_mask_start_with_size[batch_idx,
+ batch_idx, pos_idx = special_ts_token_mask_start_nonzero[idx]
+ special_ts_token_mask_start_with_size[batch_idx, pos_idx] *= (patch_cnt[idx].item() - 2)
  patch_index += num_ts_in_batch
  
  # Compute the maximum embed dimension, considering both start and end tokens

@@ -1300,17 +1263,13 @@ class Qwen2TSForCausalLM(Qwen2PreTrainedModel):
  
  # batch_indices, non_ts_indices = torch.where(~special_ts_token_mask)
  batch_indices, non_ts_indices = torch.where(~special_ts_token_mask)
-
- # print("batch_indices:", batch_indices)
-
+
  # 2. Compute the positions where text should be written
  new_token_positions = torch.cumsum((special_ts_token_mask_start_with_size + 1), dim=-1) - 1
- # print("new_token_positions", new_token_positions)
  nb_ts_pad = max_embed_dim - 1 - new_token_positions[:, -1]
  if left_padding:
  new_token_positions += nb_ts_pad[:, None]  # offset for left padding
  text_to_overwrite = new_token_positions[batch_indices, non_ts_indices]
- # print('nb_ts_pad', nb_ts_pad)
  
  # 3. Create the full embedding, already padded to the maximum position
  final_embedding = torch.zeros(

@@ -1334,7 +1293,6 @@ class Qwen2TSForCausalLM(Qwen2PreTrainedModel):
  # 4. Fill the embeddings based on the mask
  final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_ts_indices]
  final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_ts_indices]
- # print('final_attention_mask=', final_attention_mask)
  if labels is not None:
  final_labels[batch_indices, text_to_overwrite] = labels[batch_indices, non_ts_indices]
  

@@ -1343,11 +1301,8 @@ class Qwen2TSForCausalLM(Qwen2PreTrainedModel):
  (batch_size, max_embed_dim), True, dtype=torch.bool, device=inputs_embeds.device
  )
  ts_to_overwrite[batch_indices, text_to_overwrite] = False
- # print('ts_to_overwrite.long().cumsum(-1) - 1=', ts_to_overwrite.long().cumsum(-1) - 1)
- # print('nb_ts_pad=', nb_ts_pad[:, None])
  reversed_cumsum = ts_to_overwrite.flip(dims=[-1]).cumsum(-1).flip(dims=[-1]) - 1
  ts_to_overwrite &= reversed_cumsum >= nb_ts_pad[:, None].to(target_device)
- # print('ts_to_overwrite=', ts_to_overwrite)
  
  if ts_to_overwrite.sum() != time_series_features.shape[:-1].numel():
  raise ValueError(

@@ -1356,7 +1311,6 @@ class Qwen2TSForCausalLM(Qwen2PreTrainedModel):
  )
  
  final_embedding[ts_to_overwrite] = time_series_features.contiguous().reshape(-1, embed_dim).to(target_device)
- # logger.warning(f"[DEBUG] {final_embedding[ts_to_overwrite][:, 0]=}")
  final_attention_mask |= ts_to_overwrite
  position_ids = (final_attention_mask.cumsum(-1) - 1).masked_fill_((final_attention_mask == 0), 1)
  
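The only functional change in this group of hunks is the rewritten loop body: each `<ts>` start position has its entry in `special_ts_token_mask_start_with_size` scaled to `patch_cnt - 2`, and the unchanged `torch.cumsum(... + 1) - 1` line then yields the column where every original token lands in the merged embedding. A toy illustration (the prompt layout and patch count are invented):

```python
import torch

# Prompt: [text, <ts>, closing ts marker, text]; assume its series produced
# patch_cnt = 5 patches, so the <ts> start position carries 5 - 2 = 3 extra slots.
special_ts_token_mask_start_with_size = torch.tensor([[0, 3, 0, 0]])
special_ts_token_mask = torch.tensor([[False, True, True, False]])   # both ts markers

# Every token occupies one slot plus its extra expansion.
new_token_positions = torch.cumsum(special_ts_token_mask_start_with_size + 1, dim=-1) - 1
print(new_token_positions.tolist())                                  # [[0, 4, 5, 6]]

# Text tokens keep their embeddings at these columns...
batch_indices, non_ts_indices = torch.where(~special_ts_token_mask)
print(new_token_positions[batch_indices, non_ts_indices].tolist())   # [0, 6]

# ...and the five unclaimed columns (1-5) are exactly where the five patch
# embeddings are scattered via ts_to_overwrite in the code above.
```
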
@@ -1423,47 +1377,16 @@ class Qwen2TSForCausalLM(Qwen2PreTrainedModel):
  inputs_embeds = self.get_input_embeddings()(input_ids)
  
  if timeseries is not None and timeseries.shape[0] > 0:
+ # Disable KV Cache as it has not been implemented yet
  use_cache = False
- # print(f"[DEBUG] input timeseries.shape: {timeseries.shape}")
-
- # Call ts_encoder and print the input and output shapes
  ts_features, patch_cnt = self.ts_encoder(timeseries)
- # print(f"[DEBUG] ts_features.shape: {ts_features.shape}")
- # print(f"[DEBUG] patch_cnt: {patch_cnt}")
  
  inputs_embeds = inputs_embeds.to(ts_features.dtype)
  
- # Print the relevant shapes before merging
- # print(f"[DEBUG] Before merging:")
- # print(f"{inputs_embeds[0, -5:, :5]=}")
- # print(f"{attention_mask.sum()=}")
- # print(f" inputs_embeds.shape: {inputs_embeds.shape}")
- # print(f" input_ids.shape: {input_ids.shape}")
- # print(f" attention_mask.shape: {attention_mask.shape}")
- # if labels is not None:
- # print(f" labels.shape: {labels.shape}")
- # else:
- # print(f" labels: None")
- # print(f" patch_cnt.shape: {patch_cnt.shape}")
-
- # Call _merge_input_ids_with_time_series_features and print the output shapes
  inputs_embeds, attention_mask, position_ids, labels = self._merge_input_ids_with_time_series_features(
  ts_features, inputs_embeds, input_ids, attention_mask, labels, patch_cnt
  )
  
- # print(f"[DEBUG] After merging:")
- # print(f" inputs_embeds.shape: {inputs_embeds.shape}")
- # print(f" attention_mask.shape: {attention_mask.shape}")
- # print(f"{attention_mask.sum()=}")
- # print(f"{inputs_embeds[0, -5:, :5]=}")
-
- # print(f" position_ids.shape: {position_ids.shape}")
- # if labels is not None:
- # print(f" labels.shape: {labels.shape}")
- # else:
- # print(f" labels: None")
-
- # Continue with the forward pass of the base model
  outputs = self.model(
  attention_mask=attention_mask,
  position_ids=position_ids,
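The hunk above is the heart of the multimodal forward pass: when a `timeseries` batch is present the KV cache is disabled, `ts_encoder` turns the series into patch embeddings, and `_merge_input_ids_with_time_series_features` splices those embeddings into the token embeddings before the base Qwen2 stack runs. A runnable toy of that ordering with stand-in modules (nothing here is the repo's actual encoder or merge logic):

```python
import torch
import torch.nn as nn

class ToyTSEncoder(nn.Module):
    """Stand-in for ts_encoder: fixed-length patching plus a linear embedding."""
    def __init__(self, patch_size=4, hidden=8):
        super().__init__()
        self.patch_size = patch_size
        self.mlp = nn.Linear(patch_size, hidden)

    def forward(self, ts):                                   # ts: (num_series, length)
        patches = ts.view(ts.size(0), -1, self.patch_size)   # (num_series, num_patches, patch_size)
        feats = self.mlp(patches)
        return feats.flatten(0, 1), torch.full((ts.size(0),), patches.size(1))

hidden = 8
embed = nn.Embedding(20, hidden)
ts_encoder = ToyTSEncoder(hidden=hidden)

input_ids = torch.tensor([[3, 18, 19, 5]])      # 18/19 stand in for the two ts marker tokens
timeseries = torch.randn(1, 8)                  # one series, 8 steps -> 2 patches

inputs_embeds = embed(input_ids)
ts_feats, patch_cnt = ts_encoder(timeseries)

# Toy "merge": keep the text embeddings and replace the marker span with the patch
# embeddings (the real code does this with the scatter logic shown earlier).
merged = torch.cat([inputs_embeds[:, :1], ts_feats.unsqueeze(0), inputs_embeds[:, 3:]], dim=1)
print(merged.shape)                             # torch.Size([1, 4, 8])

# The merged embeddings (plus the rebuilt attention mask and position ids) are what
# get handed to the base model via inputs_embeds in the real forward pass.
```
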
@@ -1518,8 +1441,6 @@ class Qwen2TSForCausalLM(Qwen2PreTrainedModel):
  cache_length = past_length = past_key_values[0][0].shape[2]
  max_cache_length = None
  
- # print(f"[prepare_inputs_for_generation] {cache_length=}, {past_length=}, {max_cache_length=}")
-
  # Keep only the unprocessed tokens:
  # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
  # some of the inputs are exclusively passed as part of the cache (e.g. when passing input_embeds as