import sys import argparse import copy import os, shutil import imageio import cv2 from PIL import Image, ImageDraw import os.path as osp import random import numpy as np import torch.multiprocessing as mp from multiprocessing import set_start_method import math, time, gc import torch import torch.nn.functional as F import matplotlib.pyplot as plt from segment_anything import SamAutomaticMaskGenerator, SamPredictor, sam_model_registry # Import files from the local path root_path = os.path.abspath('.') sys.path.append(root_path) from config.flowformer_config import get_cfg from flowformer_code.utils import flow_viz, frame_utils from flowformer_code.utils.utils import InputPadder from flowformer_code.FlowFormer import build_flowformer TRAIN_SIZE = [432, 960] def show_anns(anns): if len(anns) == 0: return sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True) ax = plt.gca() ax.set_autoscale_on(True) img = np.ones((sorted_anns[0]['segmentation'].shape[0], sorted_anns[0]['segmentation'].shape[1], 4)) img[:,:,3] = 0 for ann in sorted_anns: m = ann['segmentation'] color_mask = np.concatenate([np.random.random(3), [0.35]]) img[m] = color_mask return img*255 def show_mask(mask, random_color=False): if random_color: color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0) else: color = np.array([30/255, 144/255, 255/255, 0.6]) h, w = mask.shape[-2:] mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1) return mask_image * 255 def compute_grid_indices(image_shape, patch_size=TRAIN_SIZE, min_overlap=20): if min_overlap >= TRAIN_SIZE[0] or min_overlap >= TRAIN_SIZE[1]: raise ValueError( f"Overlap should be less than size of patch (got {min_overlap}" f"for patch size {patch_size}).") if image_shape[0] == TRAIN_SIZE[0]: hs = list(range(0, image_shape[0], TRAIN_SIZE[0])) else: hs = list(range(0, image_shape[0], TRAIN_SIZE[0] - min_overlap)) if image_shape[1] == TRAIN_SIZE[1]: ws = list(range(0, image_shape[1], TRAIN_SIZE[1])) else: ws = list(range(0, image_shape[1], TRAIN_SIZE[1] - min_overlap)) # Make sure the final patch is flush with the image boundary hs[-1] = image_shape[0] - patch_size[0] ws[-1] = image_shape[1] - patch_size[1] return [(h, w) for h in hs for w in ws] def compute_flow(model, image1, image2, weights=None): print(f"computing flow...") image_size = image1.shape[1:] image1, image2 = image1[None].cuda(), image2[None].cuda() hws = compute_grid_indices(image_size) if weights is None: # no tile padder = InputPadder(image1.shape) image1, image2 = padder.pad(image1, image2) flow_pre, _ = model(image1, image2) flow_pre = padder.unpad(flow_pre) flow = flow_pre[0].permute(1, 2, 0).cpu().numpy() else: # tile flows = 0 flow_count = 0 for idx, (h, w) in enumerate(hws): image1_tile = image1[:, :, h:h+TRAIN_SIZE[0], w:w+TRAIN_SIZE[1]] image2_tile = image2[:, :, h:h+TRAIN_SIZE[0], w:w+TRAIN_SIZE[1]] flow_pre, _ = model(image1_tile, image2_tile) padding = (w, image_size[1]-w-TRAIN_SIZE[1], h, image_size[0]-h-TRAIN_SIZE[0], 0, 0) flows += F.pad(flow_pre * weights[idx], padding) flow_count += F.pad(weights[idx], padding) flow_pre = flows / flow_count flow = flow_pre[0].permute(1, 2, 0).cpu().numpy() return flow def compute_adaptive_image_size(image_size): target_size = TRAIN_SIZE scale0 = target_size[0] / image_size[0] scale1 = target_size[1] / image_size[1] if scale0 > scale1: scale = scale0 else: scale = scale1 image_size = (int(image_size[1] * scale), int(image_size[0] * scale)) return image_size def prepare_image(viz_root_dir, fn1, fn2, keep_size): print(f"preparing image...") image1 = frame_utils.read_gen(fn1) image2 = frame_utils.read_gen(fn2) image1 = np.array(image1).astype(np.uint8)[..., :3] image2 = np.array(image2).astype(np.uint8)[..., :3] if not keep_size: dsize = compute_adaptive_image_size(image1.shape[0:2]) image1 = cv2.resize(image1, dsize=dsize, interpolation=cv2.INTER_CUBIC) image2 = cv2.resize(image2, dsize=dsize, interpolation=cv2.INTER_CUBIC) image1 = torch.from_numpy(image1).permute(2, 0, 1).float() image2 = torch.from_numpy(image2).permute(2, 0, 1).float() dirname = osp.dirname(fn1) filename = osp.splitext(osp.basename(fn1))[0] viz_dir = osp.join(viz_root_dir, dirname) # if not osp.exists(viz_dir): # os.makedirs(viz_dir) viz_fn = osp.join(viz_dir, filename + '.png') return image1, image2, viz_fn def build_model(): print(f"building model...") cfg = get_cfg() model = torch.nn.DataParallel(build_flowformer(cfg)) model.load_state_dict(torch.load(cfg.model)) model.cuda() model.eval() return model def filter_uv(flow, threshold_factor = 0.2): u = flow[:,:,0] v = flow[:,:,1] rad = np.sqrt(np.square(u) + np.square(v)) rad_max = np.max(rad) threshold = threshold_factor * rad_max flow[:,:,0][rad < threshold] = 0 flow[:,:,1][rad < threshold] = 0 return flow def visualize_traj(base_img, traj_path, connect_points = True): target_vertical, target_horizontal = traj_path[-1] if connect_points and len(traj_path) > 1: # Draw a line to connect two point to show motion direction start_coordinate = (traj_path[-2][1], traj_path[-2][0]) end_coordinate = (traj_path[-1][1], traj_path[-1][0]) pil_img = Image.fromarray(base_img) # Draw the line color = 'red' draw = ImageDraw.Draw(pil_img) draw.line([start_coordinate, end_coordinate], fill = color, width = 3) base_img = np.array(pil_img) # Draw a green dot only for the start point if len(traj_path) == 1: dot_range = 3 for i in range(-1*dot_range, dot_range+1): for j in range(-1*dot_range, dot_range+1): dil_vertical, dil_horizontal = target_vertical + i, target_horizontal + j if (0 <= dil_vertical and dil_vertical < base_img.shape[0]) and (0 <= dil_horizontal and dil_horizontal < base_img.shape[1]): base_img[dil_vertical][dil_horizontal] = [0, 128, 0] else: print("The traj is out of boundary!!!!!!!!!!!!!!!!!!!!! and we won't consider it") # 现在 return (False, base_img) return (True, base_img) def calculate_flow(viz_root_dir, store_dir, img_pairs, optical_flow_model, sam_predictor, SAM_positive_sample_num, SAM_negative_sample_num, mask_generator, traj_visualization, keep_size, verbose=False): # Trajectory prepare traj_path = [] # It collects all points traversed in a temporal order is_hard_to_track = False # If this is True, it means that, we have a time in tracking hard to find dx and dy movement. Under this circumstance, we are not very recommended to use it hard_track_idxs = set() traj_image_lists = [] # Iterate all image pairs for idx, img_pair in enumerate(img_pairs): fn1, fn2 = img_pair print(f"processing {fn1}, {fn2}...") image1, image2, viz_fn = prepare_image(viz_root_dir, fn1, fn2, keep_size) # Be very careful, image1 and image2 may be different resolution shape if keep_size is False # Generate the optical flow and filter those that is small motion flow_uv = filter_uv(compute_flow(optical_flow_model, image1, image2, None)) # if verbose: # Store the visualization of flow_uv # flow_img = flow_viz.flow_to_image(flow_uv) # cv2.imwrite("optical_flow_" + str(idx+1) + ".png", flow_img[:, :, [2,1,0]]) if idx == 0: # We will store the first image to memory for further visualization purpose # Base img # base_img = np.uint8(np.transpose(image1.numpy(), (1,2,0))) # SAM figure # sam_all = mask_generator.generate(image1) # base_img = show_anns(sam_all) # base_img = np.transpose(base_img, (1,2,0)) # Plain white image base_img = np.zeros(np.transpose(image1.numpy(), (1,2,0)).shape, dtype=np.uint8) base_img.fill(255) # Extract moving points (positive point) positive_point_cords = [] nonzeros = np.nonzero(flow_uv) # [(vertical), (horizontal)] if len(nonzeros[0]) < SAM_positive_sample_num: # We require the number of points to be more than SAM_positive_sample_num return False positive_orders = np.random.choice(len(nonzeros[0]), SAM_positive_sample_num, replace=False) # we have randomly select instead of use all in the sam_predictor prediction for i in range(len(nonzeros[0])): if i in positive_orders: positive_point_cords.append([nonzeros[1][i], nonzeros[0][i]]) # 根据document来看,这个就应该是先horizontal再vertical,也就是这个顺序 positive_point_cords = np.array(positive_point_cords) positive_point_labels = np.ones(len(positive_point_cords)) # Define negative sample (outside the optical flow choice) if SAM_negative_sample_num != 0: skip_prob = 2 * SAM_negative_sample_num / (flow_uv.shape[0]*flow_uv.shape[1] - len(nonzeros[0])) negative_point_cords = [] for i in range(flow_uv.shape[0]): for j in range(flow_uv.shape[1]): if flow_uv[i][j][0] == 0 and flow_uv[i][j][1] == 0: # 0 means the no motion zone and we have already filter low motion as zero before if random.random() < skip_prob: negative_point_cords.append([j, i]) # 根据document来看,这个就应该是先horizontal再vertical,也就是这个顺序 negative_point_cords = np.array(negative_point_cords) # [:SAM_negative_sample_num] negative_point_labels = np.zeros(len(negative_point_cords)) # Make sure that it is less than / equals to SAM_negative_sample_num quantity ################## Use SAM to filter out what we need (& use negative points) ################## if idx == 0: # Only consider the first frame now. # With sample coordinate sam_predictor.set_image(np.uint8(np.transpose(image1.numpy(), (1,2,0)))) if SAM_negative_sample_num != 0 and len(negative_point_cords) != 0: all_point_cords = np.concatenate((positive_point_cords, negative_point_cords), axis=0) all_point_labels = np.concatenate((positive_point_labels, negative_point_labels), axis=0) else: all_point_cords = positive_point_cords all_point_labels = positive_point_labels masks, scores, logits = sam_predictor.predict( point_coords=all_point_cords, point_labels=all_point_labels, multimask_output=False, ) mask = masks[0] # TODO: 一定要确定我们这里选择了最大的mask,而没有考虑的第二大和其他的, 这里可能有bug,我们默认了第一个就是最大的mask # if verbose: # cv2.imwrite("mask_"+str(idx+1)+".png", (np.uint8(mask)*255)) # annotated_img = show_mask(mask) # cv2.imwrite("annotated.png", annotated_img) ################## Choose the one we need as the reference for the future tracking ################## # Choose a random point in the mask target_zone = np.nonzero(mask) # [(vertical), (horizontal)] target_zone = [(target_zone[0][i], target_zone[1][i]) for i in range(len(target_zone[0]))] # Now, the sturcture is [(vertical, horizontal), ...] repeat_time = 0 loop2find = True while loop2find: loop2find = False start_point = target_zone[np.random.choice(len(target_zone), 1, replace=False)[0]] start_vertical, start_horizontal = start_point repeat_time += 1 if repeat_time == 100: # In some minor case, it may have infinite loop, so we need to manually break if it is looping print("We are still hard to find a optimal first point, but we cannot let it loop") break # Try to choose a start_point that is more centralized (Not close to the border) fast_break = False for i in range(-15, 15): for j in range(-15, 15): dil_vertical, dil_horizontal = start_vertical + i, start_horizontal + j if (0 <= dil_vertical and dil_vertical < mask.shape[0]) and (0 <= dil_horizontal and dil_horizontal < mask.shape[1]): if mask[dil_vertical][dil_horizontal] == 0: print("We need to change to a new position for the start p Since this one is close to the border of the object...........") loop2find = True fast_break = True break else: # We won't want to consider those that is close to the boundary print("We need to change to a new position Since this one is close to the border of the image...........") loop2find = True fast_break = True break if fast_break: break traj_path.append(start_point) status, base_img = visualize_traj(base_img, traj_path) if status == False: # If the traj is False, we won't consider it anymore. file = open("log.txt", "a") file.write("Invalid start point\n") return False # Read from the last one in traj ref_vertical, ref_horizontal = traj_path[-1][0], traj_path[-1][1] # Get the average motion vector for point surrounding (8+1 directions) the ref_point; This is because this is the most accurate statistics horizon_lists, vertical_lists = [], [] start_range, end_range = -5, 5 # Calculate the average motion based on surrounding motion search_times = 0 while len(horizon_lists) == 0: # If we cannot find a direction, we use average value inside this mask, but we will flag it. search_times += 1 if search_times > 1: print("This is hard to track!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! and we have tracked " + str(search_times) + " times") # TODO: 如果out of boundary那种,search times到了8-10次的就砍掉那后面frame吧,这种非常inaccurate了, 你也可以retrack一个新的点,但是没有什么意义,看整体数量来定吧 is_hard_to_track = True hard_track_idxs.add(idx) if abs(start_range) >= flow_uv.shape[0]//2: file = open("log.txt", "a") file.write("This folder has search all space but didn't find any place to track optical flow\n") return False # If we have already search for the whole graph but didn't find anything to track, we discard this sample # Search for a larger space which is nearby 我觉得扩大搜索范围应该是最稳定的选择吧 for i in range(start_range, end_range): for j in range(start_range, end_range): target_vertical, target_horizontal = ref_vertical + i, ref_horizontal + j if 0 <= target_vertical and target_vertical < flow_uv.shape[0] and 0 <= target_horizontal and target_horizontal < flow_uv.shape[1]: if flow_uv[target_vertical, target_horizontal, 0] == 0 or flow_uv[target_vertical, target_horizontal, 1] == 0: continue # Ignore zero vector to ensure only calculate moving position horizon_lists.append(flow_uv[target_vertical, target_horizontal, 0]) # Horizontal motion strength vertical_lists.append(flow_uv[target_vertical, target_horizontal, 1]) # Vertical motion strength # If there isn't any to search, we kepp on a larger space start_range -= 10 end_range += 10 average_dx = sum(horizon_lists)/len(horizon_lists) average_dy = sum(vertical_lists)/len(vertical_lists) print("average movement is ", (average_dx, average_dy)) traj_path.append(( int(traj_path[-1][0] + average_dy), int(traj_path[-1][1] + average_dx))) # Append the motion in independent order print(traj_path) ##################### Visualize the trajectory path (Debug Purpose) ##################### status, base_img = visualize_traj(base_img, traj_path) if status == False: # If the traj is False, we won't consider it anymore. return False cv2.imwrite(os.path.join(store_dir, "traj_path.png"), cv2.cvtColor(base_img, cv2.COLOR_BGR2RGB)) if traj_visualization: status, single_traj_img = visualize_traj(np.uint8(np.transpose(image1.numpy(), (1,2,0))), traj_path[:-1], connect_points=False) if status == False: # If the traj is False, we won't consider it anymore. return False traj_write_path = os.path.join(store_dir, "traj_"+str(idx)+".png") # cv2.imwrite(traj_write_path, cv2.cvtColor(single_traj_img, cv2.COLOR_BGR2RGB)) traj_image_lists.append(traj_write_path) # if traj_visualization: # images = [] # for filename in traj_image_lists: # images.append(imageio.imread(filename)) # # os.remove(filename) # Remove when used # imageio.mimsave(os.path.join(store_dir, 'traj_motion.gif'), images, duration=0.05) # TODO: 可以如果hard to track,就aggressivly多试即便,我们根据这个hard_track_idxs的长度来粗略判断哪个最好,三次里面选最好的 if is_hard_to_track: if len(hard_track_idxs) >= len(img_pairs)//3: # If more than half of the traj is hard to track, we need to consider discard this one file = open("log.txt", "a") file.write("we have a lot of times hard to find dx and dy movement. Under this circumstance, we are not very recommended to use the track\n") return False # Write a file store all position for further utilization txt_path = os.path.join(store_dir, "traj_data.txt") if os.path.exists(txt_path): os.remove(txt_path) file = open(txt_path, "a") for traj in traj_path: file.write(str(traj[0]) + " " + str(traj[1]) + "\n") # Save in numpy information # with open(os.path.join(store_dir, 'traj_data.npy'), 'wb') as f: # np.save(f, flow_uv) print("We write ", traj_path) return True def manage_seq_range(input_dir, store_dir, total_frame_needed): lists = os.listdir(input_dir) lists = lists[2:-2] num_frames_input = len(lists) if num_frames_input < total_frame_needed: print("The number of frames is too short for constructing the sequnece length needed") return False division_factor = num_frames_input // total_frame_needed remain_frame = num_frames_input % total_frame_needed gaps = [division_factor for _ in range(total_frame_needed)] for idx in range(remain_frame): gaps[idx] += 1 cur_idx = 2 for global_idx, gap in enumerate(gaps): source_path = os.path.join(input_dir, "im_"+str(cur_idx)+".jpg") destination_path = os.path.join(store_dir, "im_"+str(global_idx)+".jpg") shutil.copyfile(source_path, destination_path) cur_idx += gap return True def generate_pairs(dirname, start_idx, end_idx): img_pairs = [] for idx in range(start_idx, end_idx): img1 = osp.join(dirname, f'im_{idx}.jpg') img2 = osp.join(dirname, f'im_{idx+1}.jpg') # img1 = f'{idx:06}.png' # img2 = f'{idx+1:06}.png' img_pairs.append((img1, img2)) return img_pairs def process_partial_request(request_list, num_frames, traj_visualization, viz_root_dir): # Init the optical flow model optical_flow_model = build_model() # Init SAM for segmentation task model_type = "vit_h" weight_path = "pretrained/sam_vit_h_4b8939.pth" SAM_positive_sample_num = 20 # How many points we use for the positive sample num () SAM_negative_sample_num = 0 # How many points we use for the negative sample num print("In multi processing, we will build an instance of mask_generator independently") sam = sam_model_registry[model_type](checkpoint=weight_path).to(device="cuda") mask_generator = SamAutomaticMaskGenerator(sam) print("In multi processing, we will build an instance of sam_predictor independently") sam_predictor = SamPredictor(sam) counter = 0 while True: counter += 1 if counter == 10: counter = 0 gc.collect() print("We will sleep here to clear memory") time.sleep(5) info = request_list[0] request_list = request_list[1:] if info == None: print("This queue ends") break # Process each sub_input_dir and store the information there sub_input_dir = info img_pairs = generate_pairs(sub_input_dir, 0, num_frames-1) print(img_pairs) with torch.no_grad(): # Calculate the optical flow and return a status to say whther this generated flow is usable status = calculate_flow(viz_root_dir, sub_input_dir, img_pairs, optical_flow_model, sam_predictor, SAM_positive_sample_num, SAM_negative_sample_num, mask_generator, traj_visualization, keep_size = True) # file = open("log.txt", "a") print("The status for folder " + sub_input_dir + " is " + str(status) + "\n") if status == False: # If the status is failed, we will remove it afterwords print("The status is Failed, so we won't store this one as one promising data") else: print("We have successfully process one!") if __name__ == '__main__': # Manage the paramter parser = argparse.ArgumentParser() parser.add_argument('--input_dir', default = '../validation_flow14/') parser.add_argument('--num_workers', type = int, default = 1) # starting index of the image sequence parser.add_argument('--viz_root_dir', default = 'viz_results') parser.add_argument('--traj_visualization', default = True) # If this is True, # list_start = 0 # list_end = 25000 num_frames = 14 args = parser.parse_args() input_dir = args.input_dir num_workers = args.num_workers viz_root_dir = args.viz_root_dir traj_visualization = args.traj_visualization store_idx = 0 dir_list = [] for sub_input_name in sorted(os.listdir(input_dir)): sub_input_dir = os.path.join(input_dir, sub_input_name) # sub_store_dir = os.path.join(store_dir, "0"*(7-len(str(store_idx)))+str(store_idx)) store_idx += 1 dir_list.append(sub_input_dir) # Truncate the list to the target # dir_list = dir_list[list_start:] # Use multiprocessing to handle to speed up num = math.ceil(len(dir_list) / num_workers) for idx in range(num_workers): # set_start_method('spawn', force=True) request_list = dir_list[:num] request_list.append(None) dir_list = dir_list[num:] process_partial_request(request_list, num_frames, traj_visualization, viz_root_dir) # This is for debug purpose # p = mp.Process(target=process_partial_request, args=(request_list, num_frames, traj_visualization, viz_root_dir, )) # p.start() print("Submitted all jobs!") # p.join() # 好像不加这个multiprocess就莫名自己结束了 print("All task finished!")