import numpy import igl import numpy as np import torch import time from scipy.sparse import diags,coo_matrix from scipy.sparse import csc_matrix as sp_csc USE_TORCH_SPARSE = True ## This uses TORCH_SPARSE instead of TORCH.SPARSE # This four are mutually exclusive USE_CUPY = False ## This uses CUPY LU decomposition on GPU USE_CHOLESPY_GPU = True ## This uses cholesky decomposition on GPU USE_CHOLESPY_CPU = False ## This uses cholesky decomposition on CPU USE_SCIPY = False ## This uses CUPY LU decomposition on CPU # If USE_SCIPY = True, wether or not to use enhanced backend USE_SCIKITS_UMFPACK = False ## This uses UMFPACK backend for scipy instead of naive scipy. if USE_CHOLESPY_GPU or USE_CHOLESPY_CPU: from cholespy import CholeskySolverD, MatrixType if USE_CUPY and torch.cuda.is_available(): from cupyx.scipy.sparse.linalg import spsolve_triangular from cupyx.scipy.sparse import csr_matrix import cupy from torch.utils.dlpack import to_dlpack, from_dlpack from scipy.sparse.linalg import splu as scipy_splu from scipy.sparse.linalg import spsolve_triangular, spsolve if USE_SCIPY: if USE_SCIKITS_UMFPACK: # This is a bit slower in practice # https://stackoverflow.com/questions/64401503/is-there-a-way-to-further-improve-sparse-solution-times-using-python from scikits.umfpack import splu as scipy_splu else: import scipy.sparse.linalg as lg lg.use_solver(useUmfpack=False) # Slight performance gain with True # conda install -c conda-forge scikit-umfpack # forward pass goes from 0.038 to 0.036 # assumeSortedIndices=True Does not bring any boost from scipy.sparse.linalg import splu as scipy_splu from scipy.sparse.linalg import spsolve_triangular, spsolve if USE_TORCH_SPARSE: try: import torch_sparse except ImportError: print("Warning: torch_sparse not available, falling back to built-in PyTorch sparse operations") USE_TORCH_SPARSE = False USE_UGLY_PATCH_FOR_CUPY_ERROR = False class SparseMat: ''' Sparse matrix object represented in the COO format Refacto : consider killing this object, byproduct of torch_sparse instead of torch.sparse (new feature) ''' @staticmethod def from_M(M,ttype): return SparseMat(M[0],M[1],M[2],M[3],ttype) @staticmethod def from_coo(coo,ttype): inds = numpy.vstack((coo.row,coo.col)) return SparseMat(inds,coo.data,coo.shape[0],coo.shape[1],ttype) def __init__(self,inds,vals,n,m,ttype): self.n = n self.m = m self.vals = vals self.inds = inds assert(inds.shape[0] == 2) assert(inds.shape[1] == vals.shape[0]) assert(np.max(inds[0,:]) <= n) assert(np.max(inds[1,:] <= m)) #TODO figure out how to extract the I,J,V,m,n from this, then load a COO mat directly from npz #self.coo_mat = coo_matrix((cupy.array(self.vals), (cupy.array(self.inds[0,:]), cupy.array(self.inds[1,:])))) self.vals = torch.from_numpy(self.vals).type(ttype).contiguous() self.inds = torch.from_numpy(self.inds).type(torch.int64).contiguous() def to_coo(self): return coo_matrix((self.vals, (self.inds[0,:], self.inds[1,:])), shape = (self.n, self.m)) def to_csc(self): return sp_csc((self.vals, (self.inds[0,:], self.inds[1,:])), shape = (self.n, self.m)) def to_cholesky(self): return CholeskySolverD(self.n, self.inds[0,:], self.inds[1,:], self.vals, MatrixType.COO) def to(self,device): self.vals = self.vals.to(device) self.inds = self.inds.to(device) return self def pin_memory(self): return # self.vals.pin_memory() # self.inds.pin_memory() def multiply_with_dense(self,dense): if USE_TORCH_SPARSE: res = torch_sparse.spmm(self.inds,self.vals, self.n, self.m, dense) # 1000 for loop on the above line takes 0.13 sec. Fast but annoying to have this dependency else: # Somehow this is not implemented for now? # res = torch.smm(torch.sparse_coo_tensor(self.inds,self.vals) , (dense.float())).to_dense().to(dense.device) # 1000 for loop on the above line takes 10 sec on the CPU. It is not implemented on gpu yet Slower but no dependency if self.vals.device.type == 'cpu': tensor_zero_hack = torch.FloatTensor([0]).double() # This line was somehow responsible for a nasty NAN bug else: tensor_zero_hack = torch.cuda.FloatTensor([0]).to(dense.get_device()).double() # beware with addmm, it is experimental and gave me a NaN bug! res = torch.sparse.addmm(tensor_zero_hack, torch.sparse_coo_tensor(self.inds.double(),self.vals.double()) , (dense.double())).type_as(self.vals) # 1000 for loop on the above line takes 0.77 sec. Slower but no dependency return res.contiguous() class PoissonSystemMatrices: ''' Holds the matrices needed to perform gradient and poisson computations Logic : this class is supposed is supposed to hold everything needed to compute Poisson Solver Refacto : merge with Poisson Solver Only accept SparseMat representation ''' def __init__(self, V, F,grad, rhs, w, ttype, is_sparse = True, lap = None, cpuonly=False): self.dim = 3 self.is_sparse = is_sparse self.w = w self.rhs = rhs self.igl_grad = grad self.ttype = ttype self.__splu_L = None self.__splu_U = None self.__splu_perm_c = None self.__splu_perm_r = None self.lap = lap self.__V = V self.__F = F self.cpuonly = cpuonly self.cpu_splu = None def create_poisson_solver(self): return PoissonSolver(self.igl_grad,self.w,self.rhs, None, self.lap) def create_poisson_solver_from_splu_old(self, lap_L, lap_U, lap_perm_c, lap_perm_r): w = torch.from_numpy(self.w).type(self.ttype) lap = None my_splu = None if not self.cpuonly: if USE_CUPY: my_splu = MyCuSPLU(lap_L, lap_U, lap_perm_c, lap_perm_r) else: if self.lap is not None: lap = self.lap # my_splu = scipy_splu(self.lap) # my_splu = MyCuSPLU_CPU(lap_L, lap_U, lap_perm_c, lap_perm_r) else: my_splu = MyCuSPLU_CPU(lap_L, lap_U, lap_perm_c, lap_perm_r) # st = time.time() # my_splu = scipy_splu(lap_L@lap_U) # print(f"time for LU: {time.time() - st}" ) else: if self.lap is not None: my_splu = scipy_splu(self.lap) else: 0/0 # my_splu = splu(lap_L) return PoissonSolver(self.igl_grad,w,self.rhs,my_splu, lap) def compute_poisson_solver_from_laplacian(self, compute_splu=True): self.compute_laplacian() if compute_splu: self.compute_splu() return self.create_poisson_solver_from_splu(self.__splu_L,self.__splu_U,self.__splu_perm_c,self.__splu_perm_r) def compute_laplacian(self): if self.lap is None: self.lap = igl.cotmatrix(self.__V,self.__F) self.lap = self.lap[1:, 1:] self.lap = SparseMat.from_coo(self.lap.tocoo(), torch.float64) if isinstance(self.lap,PoissonSystemMatrices) and self.lap.vals.shape[0] == self.__V.shape[0]: assert(False), "this should not happen, the fix is to remove a column and row of the laplacian" self.lap = self.lap[1:, 1:] return self.lap def compute_splu(self): print("i am computing splu") if self.cpu_splu is None: # st = time.time() s = scipy_splu(self.lap) # print(f"time to compute LU {time.time() - st}") # We are storing these attributes just in case we need to create a PoissonSolver on the GPU, they are useless for CPU case. self.cpu_splu = s self.__splu_L = s.L self.__splu_U = s.U self.__splu_perm_c = s.perm_c self.__splu_perm_r = s.perm_r return self.__splu_L,self.__splu_U,self.__splu_perm_c,self.__splu_perm_r def get_new_grad(self): grad = self.igl_grad.to_coo() self.igl_grad = SparseMat.from_M(_convert_sparse_igl_grad_to_our_convention(grad.tocsc()),torch.float64) return self.igl_grad def _convert_sparse_igl_grad_to_our_convention(input): ''' The grad operator computed from igl.grad() results in a matrix of shape (3*#tri x #verts). It is packed such that all the x-coordinates are placed first, followed by y and z. As shown below ---------- ---------- | x1 ... | x1 ... | x2 ... | y1 ... | x3 ... | z1 ... | . | . | . | . | y1 ... | x2 ... | y2 ... ----> | y2 ... | y3 ... | z2 ... | . | . | . | . | z1 ... | x3 ... | z2 ... | y3 ... | z3 ... | z3 ... | . | . | . | . ---------- ---------- Note that this functionality cannot be computed trivially if because igl.grad() is a sparse tensor and as such slicing is not well defined for sparse matrices. the following code performs the above conversion and returns a torch.sparse tensor. Set check to True to verify the results by converting the matrices to dense and comparing it. ''' assert type(input) == sp_csc, 'Input should be a scipy csc sparse matrix' T = input.tocoo() r_c_data = np.hstack((T.row[..., np.newaxis], T.col[..., np.newaxis], T.data[..., np.newaxis])) # horizontally stack row, col and data arrays r_c_data = r_c_data[r_c_data[:, 0].argsort()] # sort along the row column # Separate out x, y and z blocks ''' Note that for the grad operator there are exactly 3 non zero elements in a row ''' L = T.shape[0] Tx = r_c_data[:L, :] Ty = r_c_data[L:2 * L, :] Tz = r_c_data[2 * L:3 * L, :] # align the y,z rows with x so that they too start from 0 Ty[:, 0] -= Ty[0, 0] Tz[:, 0] -= Tz[0, 0] # 'strech' the x,y,z rows so that they can be interleaved. Tx[:, 0] *= 3 Ty[:, 0] *= 3 Tz[:, 0] *= 3 # interleave the y,z into x Ty[:, 0] += 1 Tz[:, 0] += 2 Tc = np.zeros((input.shape[0] * 3, 3)) Tc[::3] = Tx Tc[1::3] = Ty Tc[2::3] = Tz indices = Tc[:, :-1].astype(int) data = Tc[:, -1] return (indices.T, data, input.shape[0], input.shape[1]) class PoissonSolver: ''' an object to compute gradients and solve poisson ''' def __init__(self,grad,W,rhs,my_splu, lap=None): self.W = torch.from_numpy(W).double() self.grad = grad self.rhs = rhs self.my_splu = my_splu self.lap = lap self.sparse_grad = grad self.sparse_rhs = rhs def to(self,device): self.W = self.W.to(device) self.sparse_grad = self.sparse_grad.to(device) self.sparse_rhs = self.sparse_rhs.to(device) if USE_CUPY or USE_CHOLESPY_GPU: self.lap = self.lap.to(device) return self def jacobians_from_vertices(self,V): res = _multiply_sparse_2d_by_dense_3d(self.sparse_grad, V).type_as(V) res = res.unsqueeze(2) return res.view(V.shape[0], -1, 3,3).transpose(2,3) def restrict_jacobians(self,D): assert isinstance(D, torch.Tensor) and len(D.shape) in [3, 4] assert D.shape[-1] == 3 and D.shape[-2] == 3 assert isinstance(self.W, torch.Tensor) and len(self.W.shape) == 3 assert self.W.shape[-1] == 2 and self.W.shape[-2] == 3 if len(D.shape) == 4: DW = torch.einsum("abcd,bde->abce", (D, self.W.type_as(D))) else: DW = torch.einsum("abcd,bde->abce", (D.unsqueeze(0), self.W)).squeeze(0) if len(DW.shape)>4: DW = DW.squeeze(0) return DW def restricted_jacobians_from_vertices(self,V): return self.restrict_jacobians(self.jacobians_from_vertices(V)) def solve_poisson(self,jacobians): # st = time.time() assert(len(jacobians.shape) == 4) assert(jacobians.shape[2] == 3 and jacobians.shape[3] == 3) # torch.cuda.synchronize() # st = time.time() if self.my_splu is None: if isinstance(self.lap,SparseMat): # self.my_splu = scipy_splu(self.lap.to('cpu').to_coo()) if USE_CHOLESPY_CPU or USE_CHOLESPY_GPU: self.my_splu = self.lap.to_cholesky() else: self.my_splu = scipy_splu(self.lap.to('cpu').to_coo()) else: self.my_splu = scipy_splu(self.lap) # print(f"computing poisson! {self.lap.vals.get_device()}") # print(f"computing poisson! {self.lap.inds.get_device()}") # print(f"computing poisson! {jacobians.get_device()}") # print(f"computing poisson! {self.sparse_rhs.vals.get_device()}") # torch.cuda.synchronize() # print(f"SOLVER decomposition {time.time() - st}") sol = _predicted_jacobians_to_vertices_via_poisson_solve(self.my_splu, self.sparse_rhs, jacobians.transpose(2, 3).reshape(jacobians.shape[0], -1, 3, 1).squeeze(3).contiguous()) # torch.cuda.synchronize() # print(f"POISSON LU + SOLVE FORWARD{time.time() - st}") c = torch.mean(sol, axis=1).unsqueeze(1) ## Beware the predicted mesh is centered here. # print(f"time for poisson: {time.time() - st}" ) return sol - c def pin_memory(self): return # self.W.pin_memory() # self.sparse_grad.pin_memory() # self.sparse_rhs.pin_memory() def poisson_system_matrices_from_mesh( V,F, dim=3,ttype = torch.float64, is_sparse=True,cpuonly=False): ''' compute poisson matricees for a given mesh :param V vertices :param F faces :param dim: for now always 3 :) :param ttype the type of tensor (e.g., float,double) :param is_sparse: for now always true :return: a PoissonMatricese object holding the computed matrices ''' assert type(dim) == int and dim in [2,3], f'Only two and three dimensional meshes are supported' assert type(is_sparse) == bool vertices = V faces = F dim = 3 is_sparse = is_sparse grad = igl.grad(vertices, faces) # grad = np.abs(grad) # temp_grad = grad.multiply(csr_matrix(1 / np.sqrt(grad.multiply(grad).sum(1)))) # gradients_normalized = grad / np.linalg.norm(grad, axis=1)[:, np.newaxis] mass = _get_mass_matrix(vertices,faces,is_sparse) ## TODO 2D Case ## if dim == 2: grad = grad[:-grad.shape[0]//3,:] mass = mass[:-mass.shape[0]//3,:-mass.shape[0]//3] laplace = grad.T@mass@grad laplace = laplace[1:, 1:] rhs = grad.T@mass b1,b2,_ = igl.local_basis(V,F) w = np.stack((b1,b2),axis=-1) # print(time.time() - s) rhs = rhs[1:,:] if is_sparse: laplace = laplace.tocoo() rhs = rhs.tocoo() grad = grad.tocsc() else: laplace = laplace.toarray() rhs = rhs.toarray() grad = grad.toarray() grad = SparseMat.from_M(_convert_sparse_igl_grad_to_our_convention(grad), torch.float64) poissonbuilder = PoissonSystemMatrices(V=V,F=F,grad=grad, rhs=SparseMat.from_coo(rhs, torch.float64), w=w, ttype=ttype,is_sparse=is_sparse, lap=SparseMat.from_coo(laplace, torch.float64), cpuonly=cpuonly) # poissonbuilder.get_new_grad() return poissonbuilder def _get_mass_matrix(vertices,faces,is_sparse): d_area = igl.doublearea(vertices,faces) d_area = np.hstack((d_area, d_area, d_area)) if is_sparse: return sp_csc(diags(d_area)) return diags(d_area) class SPLUSolveLayer(torch.autograd.Function): ''' Implements the SPLU solve as a differentiable layer, with a forward and backward function ''' @staticmethod def forward(ctx, solver, b): ''' override forward function :param ctx: context object (to keep the lu object for the backward pass) :param lu: splu object :param b: right hand side, could be a vector or matrix :return: the vector or matrix x which holds lu.solve(b) = x ''' assert isinstance(b, torch.Tensor) assert b.shape[-1] >= 1 and b.shape[-1] <= 3, f'got shape {b.shape} expected last dim to be in range 1-3' b = b.contiguous() ctx.solver = solver # st = time.time() vertices = SPLUSolveLayer.solve(solver, b).type_as(b) # print(f"FORWARD SOLVE {time.time() - st}") assert not torch.isnan(vertices).any(), "Nan in the forward pass of the POISSON SOLVE" return vertices def backward(ctx, grad_output): ''' overrides backward function :param grad_output: the gradient to be back-propped :return: the outgoing gradient to be back-propped ''' assert isinstance(grad_output, torch.Tensor) assert grad_output.shape[-1] >= 1 and grad_output.shape[ -1] <= 3, f'got shape {grad_output.shape} expected last dim to be in range 1-3' # when backpropping, if a layer is linear with matrix M, x ---> Mx, then the backprop of gradient g is M^Tg # in our case M = A^{-1}, so the backprop is to solve x = A^-T g. # Because A is symmetric we simply solve A^{-1}g without transposing, but this will break if A is not symmetric. # st = time.time() grad_output = grad_output.contiguous() grad = SPLUSolveLayer.solve(ctx.solver, grad_output) # print(f"BACKWARD SOLVE {time.time() - st}") # At this point we perform a NAN check because the backsolve sometimes returns NaNs. assert not torch.isnan(grad).any(), "Nan in the backward pass of the POISSON SOLVE" if USE_CUPY: mempool = cupy.get_default_memory_pool() pinned_mempool = cupy.get_default_pinned_memory_pool() mempool.free_all_blocks() pinned_mempool.free_all_blocks() del ctx.lu return None, grad @staticmethod def solve(solver, b): ''' solve the linear system defined by an SPLU object for a given right hand side. if the RHS is a matrix, solution will also be a matrix. :param solver: the splu object (LU decomposition) or cholesky object :param b: the right hand side to solve for :return: solution x which satisfies Ax = b where A is the poisson system lu describes ''' if USE_CUPY: b_cupy = cupy.fromDlpack(to_dlpack(b)) with cupy.cuda.Device(solver.device()): # this will hold the solution sol = cupy.ndarray(b_cupy.shape) for i in range(b_cupy.shape[2]): # b may have multiple columns, solve for each one b2d = b_cupy[..., i] # cupy.expand_dims(b_cpu[...,i],2) s = solver.solve(b2d.T).T sol[:, :, i] = s # # # convert back to torch res = from_dlpack(sol.toDlpack()) # np.save("res_gpu.npy", res.cpu().numpy()) # res = torch.zeros((1, 6889, 3), device=b.device)+ torch.mean(b) return res.type_as(b.type()) elif USE_SCIPY: #only CPU # st = time.time() assert(b.shape[0]==1), "Need to code parrallel implem on the first dim" sol = solver.solve(b[0].double().cpu().numpy()) res = torch.from_numpy(sol).to(b.device).reshape(b.shape) # print(time.time() - st) return res.type_as(b).contiguous() # Legacy code, I don't understand what is the reason for having a for loop # sol = np.ndarray(b.shape) # for i in range(b.shape[2]): # b may have multiple columns, solve for each one # b2d = b[..., i] # cupy.expand_dims(b_cpu[...,i],2) # s = lu.solve(b2d.double().cpu().float().numpy().T).T # sol[:, :, i] = s # res = torch.from_numpy(sol).to(b.device) # # np.save("res_cpu.npy", sol) # print(f"time {time.time() - st}" ) elif USE_CHOLESPY_GPU: # torch.cuda.synchronize() # # st = time.time() # assert(b.shape[0]==1), "Need to code parrallel implem on the first dim" # b = b.squeeze().double() # x = torch.zeros_like(b) # solver.solve(b, x) # # torch.cuda.synchronize() # # print(f"time cholescky GPU {time.time() - st}" ) # return x.contiguous().unsqueeze(0) # st = time.time() # print(b.get_device(), b.shape) b = b.double().contiguous() c = b.permute(1,2,0).contiguous() c = c.view(c.shape[0], -1) x = torch.zeros_like(c) solver.solve(c, x) x = x.view(b.shape[1], b.shape[2], b.shape[0]) x = x.permute(2,0,1).contiguous() # torch.cuda.synchronize() # print(f"time cholescky GPU {time.time() - st}" ) return x.contiguous() elif USE_CHOLESPY_CPU: # st = time.time() assert(b.shape[0]==1), "Need to code parrallel implem on the first dim" b = b.squeeze() b_cpu = b.cpu() x = torch.zeros_like(b_cpu) solver.solve(b_cpu, x) # print(f"time cholescky CPU {time.time() - st}" ) return x.contiguous().to(b.device).unsqueeze(0) return res.type_as(b) def _predicted_jacobians_to_vertices_via_poisson_solve(Lap, rhs, jacobians): ''' convert the predictions to the correct convention and feed it to the poisson solve ''' def _batch_rearrange_input(input): assert isinstance(input, torch.Tensor) and len(input.shape) in [2, 3] P = torch.zeros(input.shape).type_as(input) if len(input.shape) == 3: # Batched input k = input.shape[1] // 3 P[:, :k, :] = input[:, ::3] P[:, k:2 * k, :] = input[:, 1::3] P[:, 2 * k:, :] = input[:, 2::3] else: k = input.shape[0] // 3 P[:k, :] = input[::3] P[k:2 * k, :] = input[1::3] P[2 * k:, :] = input[2::3] return P def _list_rearrange_input(input): assert isinstance(input, list) and all([isinstance(x, torch.Tensor) and len(x.shape) in [2, 3] for x in input]) P = [] for p in input: P.append(_batch_rearrange_input(p)) return P if isinstance(jacobians, list): P = _list_rearrange_input(jacobians) else: P = _batch_rearrange_input(jacobians) # return solve_poisson(Lap, rhs, P) assert isinstance(P, torch.Tensor) and len(P.shape) in [2, 3] assert len(P.shape) == 3 # torch.cuda.synchronize() # st = time.time() P = P.double() input_to_solve = _multiply_sparse_2d_by_dense_3d(rhs, P) out = SPLUSolveLayer.apply(Lap, input_to_solve) out = torch.cat([torch.zeros(out.shape[0], 1, out.shape[2]).type_as(out), out], dim=1) ## Why?? Because! out = out - torch.mean(out, axis=1, keepdim=True) return out.type_as(jacobians) def _multiply_sparse_2d_by_dense_3d(mat, B): ret = [] for i in range(B.shape[0]): C = mat.multiply_with_dense(B[i, ...]) ret.append(C) ret = torch.stack(tuple(ret)) return ret class MyCuSPLU: ''' implmentation of SPLU on the gpu via CuPy ''' def __init__(self, L, U, perm_c=None, perm_r=None): # with cupy.cuda.Device(device): self.__orgL = L self.__orgU = U # self.L = csr_matrix(L) # self.U = csr_matrix(U) self.L = None self.U = None self.perm_c = perm_c self.perm_r = perm_r # self.splu = cu_splu(csr_matrix(lap)) # self.L = self.splu.L # self.U = self.splu.U # self.perm_c = self.splu.perm_c # self.perm_r = self.splu.perm_r self.__device = None def to(self, device): # assumes to receive a pytorch device object that has a "index" field # print(device) # if(self.__device is None): # raise Exception() self.__device = device.index with cupy.cuda.Device(self.__device): # self.__orgL = cupy.asarray(self.__orgL) # self.__orgU = cupy.asarray(self.__orgU) self.L = csr_matrix(self.__orgL) self.U = csr_matrix(self.__orgU) return self def device(self): return self.__device def solve(self, b): """ an attempt to use SuperLU data to efficiently solve Ax = Pr.T L U Pc.T x = b - note that L from SuperLU is in CSC format solving for c results in an efficiency warning Pr . A . Pc = L . U Lc = b - forward solve for c c = Ux - then back solve for x """ assert self.__device is not None, "need to explicitly call to() before solving" if USE_UGLY_PATCH_FOR_CUPY_ERROR: with cupy.cuda.Device(0): b[:1, :1].copy()[:, :1] with cupy.cuda.Device(self.__device): b = cupy.array(b) if self.perm_r is not None: b_old = b.copy() b[self.perm_r] = b_old assert b.device.id == self.__device, "got device" + str(b.device.id) + "instead of" + str(self.__device) # st = time.time() try: # unit_diagonal is a new kw c = spsolve_triangular(self.L, b, lower=True, unit_diagonal=True, overwrite_b=True) except TypeError: c = spsolve_triangular(self.L, b, lower=True, overwrite_b=True) px = spsolve_triangular(self.U, c, lower=False, overwrite_b=True) # print(f"time for spsolve_triangular GPU: {time.time() - st}" ) if self.perm_c is None: return px px = px[self.perm_c] # print(f'used: {mempool.used_bytes()}') # print(f'total: {mempool.total_bytes()}') return px class MyCuSPLU_CPU: ''' implmentation of SPLU on the gpu via CuPy ''' def __init__(self, L, U, perm_c=None, perm_r=None): # with cupy.cuda.Device(device): self.__orgL = L self.__orgU = U # self.L = csr_matrix(L) # self.U = csr_matrix(U) self.L = L self.U = U # self.L = L.tocsr() # self.U = U.tocsr() self.perm_c = perm_c self.perm_r = perm_r # self.splu = cu_splu(csr_matrix(lap)) # self.L = self.splu.L # self.U = self.splu.U # self.perm_c = self.splu.perm_c # self.perm_r = self.splu.perm_r self.__device = 'cpu' def to(self, device): # assumes to receive a pytorch device object that has a "index" field # print(device) # if(self.__device is None): # raise Exception() # self.__device = device.index # with cupy.cuda.Device(self.__device): # # self.__orgL = cupy.asarray(self.__orgL) # # self.__orgU = cupy.asarray(self.__orgU) # self.L = csr_matrix(self.__orgL) # self.U = csr_matrix(self.__orgU) return self def device(self): return self.__device def solve(self, b): """ an attempt to use SuperLU data to efficiently solve Ax = Pr.T L U Pc.T x = b - note that L from SuperLU is in CSC format solving for c results in an efficiency warning Pr . A . Pc = L . U Lc = b - forward solve for c c = Ux - then back solve for x """ # Could be done on GPU if self.perm_r is not None: b_old = b.copy() b[self.perm_r] = b_old # , permc_spec="NATURAL" # , permc_spec="NATURAL" # , permc_spec="NATURAL" st = time.time() # try: # unit_diagonal is a new kw # c = spsolve_triangular(self.L, b, lower=True, unit_diagonal=True, overwrite_b=True) # except TypeError: # c = spsolve_triangular(self.L, b, lower=True, overwrite_b=True) # px = spsolve_triangular(self.U, c, lower=False, overwrite_b=True) try: # unit_diagonal is a new kw c = spsolve(self.L, b, permc_spec="NATURAL") except TypeError: c = spsolve(self.L, b, permc_spec="NATURAL") px = spsolve(self.U, c, permc_spec="NATURAL") # # (self.L * c) - b / np.norm(b) print(f"time for spsolve_triangular CPU: {time.time() - st}" ) if self.perm_c is None: return px px = px[self.perm_c] # print(f'used: {mempool.used_bytes()}') # print(f'total: {mempool.total_bytes()}') return px # return cupy.asnumpy(px)