import pickle
import numpy as np
import os
import scipy.sparse as sp
import torch
from scipy.sparse import linalg
from torch.autograd import Variable
import pandas as pd


def mape_loss(target, input):
    return (torch.abs(input - target) / (torch.abs(target) + 1e-2)).mean() * 100


def MAPE(y_true, y_pre):
    y_true = y_true.reshape((-1, 1))
    y_pre = y_pre.reshape((-1, 1))
    # e = (y_true + y_pre) / 2 + 1e-2
    # re = (np.abs(y_true - y_pre) / (np.abs(y_true) + e)).mean()
    re = np.mean(np.abs((y_true - y_pre) / y_true)) * 100
    return re


def normal_std(x):
    return x.std() * np.sqrt((len(x) - 1.) / (len(x)))


class DataLoaderS(object):
    # train and valid are the ratios of the training and validation sets; test = 1 - train - valid.
    def __init__(self, file_name, train, valid, device, horizon, window, normalize=2):
        self.device = device
        self.P = window
        self.h = horizon
        with open(file_name) as fin:
            self.rawdat = np.loadtxt(fin, delimiter=',', skiprows=1)
        self.dat = np.zeros(self.rawdat.shape)
        self.n, self.m = self.dat.shape
        self.scale_mean = np.ones(self.m)
        self.scale_std = np.ones(self.m)
        self.train_size = train
        # self.scale = np.ones(self.m)
        # self._absolute_distance_normalized(normalize)
        self._z_score_normalized(normalize)
        self.train_feas = self.dat[:int(train * self.n), :]
        self._split(int(train * self.n), int((train + valid) * self.n), self.n)
        # self.de_scale_std = torch.from_numpy(self.scale_std[:3]).float().to(self.device)
        # self.de_scale_mean = torch.from_numpy(self.scale_mean[:3]).float().to(self.device)
        # self.scale = torch.from_numpy(self.scale[:3]).float()
        # self.scale = self.scale.to(device)
        # self.scale = Variable(self.scale)

    def _z_score_normalized(self, normalize):
        # Z-score each column with statistics computed on the training split only.
        for i in range(self.m):
            self.scale_mean[i] = np.mean(self.rawdat[:int(self.train_size * self.n), i])
            self.scale_std[i] = np.std(self.rawdat[:int(self.train_size * self.n), i])
            self.dat[:, i] = (self.rawdat[:, i] - self.scale_mean[i]) / self.scale_std[i]
        df = pd.DataFrame(self.dat)
        df.to_csv('data/data_set_z_score.csv', index=False)

    def _de_z_score_normalized(self, y, device_flag):
        # Invert the z-score normalization for the first three columns.
        if device_flag == 'cpu':
            de_scale_std = self.scale_std[:3]  # .to(self.device)
            de_scale_mean = self.scale_mean[:3]  # .to(self.device)
            return y * de_scale_std + de_scale_mean
        else:
            de_scale_std = torch.from_numpy(self.scale_std[:3]).float().to(self.device)
            de_scale_mean = torch.from_numpy(self.scale_mean[:3]).float().to(self.device)
            return y * de_scale_std + de_scale_mean

    def _absolute_distance_normalized(self, normalize):
        # Max-abs scaling (unused by default). self.scale is allocated here because
        # the corresponding line in __init__ is commented out.
        self.scale = np.ones(self.m)
        for i in range(self.m):
            self.scale[i] = np.max(np.abs(self.rawdat[:, i]))
            self.dat[:, i] = self.rawdat[:, i] / np.max(np.abs(self.rawdat[:, i]))

    def _split(self, train, valid, test):
        train_set = range(self.P + self.h - 1, train)
        valid_set = range(train, valid)
        test_set = range(valid, self.n)
        self.train = self._batchify(train_set, self.h)
        self.valid = self._batchify(valid_set, self.h)
        self.test = self._batchify(test_set, self.h)

    def _batchify(self, idx_set, horizon):
        # print("datshape", self.dat.shape)
        n = len(idx_set)
        X = torch.zeros((n, self.P, self.m))
        Y = torch.zeros((n, self.h, self.m))
        for i in range(n):
            end = idx_set[i] - self.h + 1
            start = end - self.P
            X[i, :, :] = torch.from_numpy(self.dat[start:end, :])
            Y[i, :, :] = torch.from_numpy(self.dat[idx_set[i] + 1 - horizon:idx_set[i] + 1, :])
        return [X, Y]

    def get_batches(self, inputs, targets, batch_size, shuffle=True):
        length = len(inputs)
        if shuffle:
            index = torch.randperm(length)
        else:
            index = torch.LongTensor(range(length))
        start_idx = 0
        while start_idx < length:
            end_idx = min(length, start_idx + batch_size)
            excerpt = index[start_idx:end_idx]
            X = inputs[excerpt]
            Y = targets[excerpt]
            X = X.to(self.device)
            Y = Y.to(self.device)
            yield Variable(X), Variable(Y)
            start_idx += batch_size


class DataLoaderM(object):
    def __init__(self, xs, ys, batch_size, pad_with_last_sample=True):
        """
        :param xs: input samples.
        :param ys: target samples aligned with xs.
        :param batch_size: number of samples per batch.
        :param pad_with_last_sample: pad with the last sample so that the number of samples is divisible by batch_size.
        """
        self.batch_size = batch_size
        self.current_ind = 0
        if pad_with_last_sample:
            num_padding = (batch_size - (len(xs) % batch_size)) % batch_size
            x_padding = np.repeat(xs[-1:], num_padding, axis=0)
            y_padding = np.repeat(ys[-1:], num_padding, axis=0)
            xs = np.concatenate([xs, x_padding], axis=0)
            ys = np.concatenate([ys, y_padding], axis=0)
        self.size = len(xs)
        self.num_batch = int(self.size // self.batch_size)
        self.xs = xs
        self.ys = ys

    def shuffle(self):
        permutation = np.random.permutation(self.size)
        xs, ys = self.xs[permutation], self.ys[permutation]
        self.xs = xs
        self.ys = ys

    def get_iterator(self):
        self.current_ind = 0

        def _wrapper():
            while self.current_ind < self.num_batch:
                start_ind = self.batch_size * self.current_ind
                end_ind = min(self.size, self.batch_size * (self.current_ind + 1))
                x_i = self.xs[start_ind: end_ind, ...]
                y_i = self.ys[start_ind: end_ind, ...]
                yield (x_i, y_i)
                self.current_ind += 1

        return _wrapper()


class StandardScaler():
    """
    Standardize the input (subtract the mean, divide by the standard deviation).
    """

    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def transform(self, data):
        return (data - self.mean) / self.std

    def inverse_transform(self, data):
        return (data * self.std) + self.mean


def sym_adj(adj):
    """Symmetrically normalize adjacency matrix."""
    adj = sp.coo_matrix(adj)
    rowsum = np.array(adj.sum(1))
    d_inv_sqrt = np.power(rowsum, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    return adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).astype(np.float32).todense()


def asym_adj(adj):
    """Asymmetrically normalize adjacency matrix (row normalization, D^-1 A)."""
    adj = sp.coo_matrix(adj)
    rowsum = np.array(adj.sum(1)).flatten()
    d_inv = np.power(rowsum, -1).flatten()
    d_inv[np.isinf(d_inv)] = 0.
    d_mat = sp.diags(d_inv)
    return d_mat.dot(adj).astype(np.float32).todense()


def calculate_normalized_laplacian(adj):
    """
    Normalized graph Laplacian: L = D^-1/2 (D - A) D^-1/2 = I - D^-1/2 A D^-1/2,
    with D = diag(A 1).
    :param adj: adjacency matrix.
    :return: normalized Laplacian as a sparse matrix.
    """
    adj = sp.coo_matrix(adj)
    d = np.array(adj.sum(1))
    d_inv_sqrt = np.power(d, -0.5).flatten()
    d_inv_sqrt[np.isinf(d_inv_sqrt)] = 0.
    d_mat_inv_sqrt = sp.diags(d_inv_sqrt)
    normalized_laplacian = sp.eye(adj.shape[0]) - adj.dot(d_mat_inv_sqrt).transpose().dot(d_mat_inv_sqrt).tocoo()
    return normalized_laplacian


def calculate_scaled_laplacian(adj_mx, lambda_max=2, undirected=True):
    # Rescale the normalized Laplacian: L_tilde = 2 L / lambda_max - I.
    if undirected:
        adj_mx = np.maximum.reduce([adj_mx, adj_mx.T])
    L = calculate_normalized_laplacian(adj_mx)
    if lambda_max is None:
        lambda_max, _ = linalg.eigsh(L, 1, which='LM')
        lambda_max = lambda_max[0]
    L = sp.csr_matrix(L)
    M, _ = L.shape
    I = sp.identity(M, format='csr', dtype=L.dtype)
    L = (2 / lambda_max * L) - I
    return L.astype(np.float32).todense()


def load_pickle(pickle_file):
    try:
        with open(pickle_file, 'rb') as f:
            pickle_data = pickle.load(f)
    except UnicodeDecodeError:
        with open(pickle_file, 'rb') as f:
            pickle_data = pickle.load(f, encoding='latin1')
    except Exception as e:
        print('Unable to load data ', pickle_file, ':', e)
        raise
    return pickle_data


def load_adj(pkl_filename):
    sensor_ids, sensor_id_to_ind, adj = load_pickle(pkl_filename)
    return adj


def load_dataset(dataset_dir, batch_size, valid_batch_size=None, test_batch_size=None):
    data = {}
    for category in ['train', 'val', 'test']:
        cat_data = np.load(os.path.join(dataset_dir, category + '.npz'))
        data['x_' + category] = cat_data['x']
        data['y_' + category] = cat_data['y']
    scaler = StandardScaler(mean=data['x_train'][..., 0].mean(), std=data['x_train'][..., 0].std())
    # Normalize the first feature channel with statistics from the training inputs.
    for category in ['train', 'val', 'test']:
        data['x_' + category][..., 0] = scaler.transform(data['x_' + category][..., 0])
    data['train_loader'] = DataLoaderM(data['x_train'], data['y_train'], batch_size)
    data['val_loader'] = DataLoaderM(data['x_val'], data['y_val'], valid_batch_size)
    data['test_loader'] = DataLoaderM(data['x_test'], data['y_test'], test_batch_size)
    data['scaler'] = scaler
    return data


def masked_mse(preds, labels, null_val=np.nan):
    if np.isnan(null_val):
        mask = ~torch.isnan(labels)
    else:
        mask = (labels != null_val)
    mask = mask.float()
    mask /= torch.mean(mask)
    mask = torch.where(torch.isnan(mask), torch.zeros_like(mask), mask)
    loss = (preds - labels) ** 2
    loss = loss * mask
    loss = torch.where(torch.isnan(loss), torch.zeros_like(loss), loss)
    return torch.mean(loss)


def masked_rmse(preds, labels, null_val=np.nan):
    return torch.sqrt(masked_mse(preds=preds, labels=labels, null_val=null_val))


def masked_mae(preds, labels, null_val=np.nan):
    if np.isnan(null_val):
        mask = ~torch.isnan(labels)
    else:
        mask = (labels != null_val)
    mask = mask.float()
    mask /= torch.mean(mask)
    mask = torch.where(torch.isnan(mask), torch.zeros_like(mask), mask)
    loss = torch.abs(preds - labels)
    loss = loss * mask
    loss = torch.where(torch.isnan(loss), torch.zeros_like(loss), loss)
    return torch.mean(loss)


def masked_mape(preds, labels, null_val=np.nan):
    if np.isnan(null_val):
        mask = ~torch.isnan(labels)
    else:
        mask = (labels != null_val)
    mask = mask.float()
    mask /= torch.mean(mask)
    mask = torch.where(torch.isnan(mask), torch.zeros_like(mask), mask)
    loss = torch.abs(preds - labels) / labels
    loss = loss * mask
    loss = torch.where(torch.isnan(loss), torch.zeros_like(loss), loss)
    return torch.mean(loss)


def metric(pred, real):
    mae = masked_mae(pred, real, 0.0).item()
    mape = masked_mape(pred, real, 0.0).item()
    rmse = masked_rmse(pred, real, 0.0).item()
    return mae, mape, rmse


def load_node_feature(path):
    # Read comma-separated node features (the first column is skipped) and z-score them.
    x = []
    with open(path) as fi:
        for li in fi:
            li = li.strip()
            li = li.split(",")
            e = [float(t) for t in li[1:]]
            x.append(e)
    x = np.array(x)
    mean = np.mean(x, axis=0)
    std = np.std(x, axis=0)
    z = torch.tensor((x - mean) / std, dtype=torch.float)
    return z
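

# Usage sketch (added for illustration; not part of the original utilities).
# A minimal smoke test, assuming synthetic inputs shaped (sample, time, node,
# feature) as consumed by DataLoaderM/load_dataset; the sizes, the toy
# adjacency matrix and the fake "prediction" below are arbitrary choices.
if __name__ == '__main__':
    # Toy adjacency matrix: asym_adj returns the row-normalized transition
    # matrix D^-1 A, sym_adj the symmetrically normalized matrix.
    A = np.array([[0., 1., 1.],
                  [1., 0., 0.],
                  [1., 0., 0.]])
    print('asym_adj:\n', asym_adj(A))
    print('sym_adj:\n', sym_adj(A))

    # Synthetic data: 100 samples, 12 time steps, 3 nodes, 1 feature.
    xs = np.random.rand(100, 12, 3, 1).astype(np.float32)
    ys = np.random.rand(100, 12, 3, 1).astype(np.float32) + 1.0  # keep targets away from the 0.0 null value

    scaler = StandardScaler(mean=xs[..., 0].mean(), std=xs[..., 0].std())
    xs[..., 0] = scaler.transform(xs[..., 0])

    loader = DataLoaderM(xs, ys, batch_size=16)  # the 100 samples are padded to 112
    loader.shuffle()
    for x_batch, y_batch in loader.get_iterator():
        real = torch.tensor(y_batch[..., 0])
        pred = real + 0.1  # stand-in for a model's prediction
        mae, mape, rmse = metric(pred, real)
        print('MAE %.4f  MAPE %.4f  RMSE %.4f' % (mae, mape, rmse))
        break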