Source code for UM2N.loader.data_transform

# Author: Chunyang Wang
# Github: chunyang-w

# In this file, we want to add extra edges within a range. All modifications
# should be made in an 'in place' fashion, so disk space is not a concern.

# We need these functionalities:

#    1. Iterate through all the files in the 'train', 'test' and 'val' directories
#    2. For each file, read it and add the extra edges (see the layout sketch below)
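
# Expected on-disk layout (editor's sketch derived from the code below; the
# subdirectory names come from `all_folders` in `__main__` and the file
# pattern from `process_subset`):
#
#     <target>/
#         data/   data_*.npy
#         train/  data_*.npy
#         test/   data_*.npy
#         val/    data_*.npy
#
# Each `data_*.npy` holds a pickled dict with at least a "coord" array and a
# bidirectional "edge_index_bi" array (see `add_edges` below).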

import glob
import os
import sys
from argparse import ArgumentParser

import numpy as np
import torch

cur_dir = os.path.dirname(__file__)
sys.path.append(cur_dir)
from cluster_utils import get_new_edges  # noqa


def arg_parse():
    parser = ArgumentParser()
    parser.add_argument(
        "--target",
        type=str,
        default=None,
        help=(
            "target directory. This dir should contain "
            "`train`, `test` and `val` subdirs."
        ),
    )
    parser.add_argument("--r", type=float, default=0.35, help="radius of a cluster")
    parser.add_argument("--M", type=int, default=None, help="nodes in a cluster")
    parser.add_argument(
        "--dist_weight",
        type=bool,
        default=False,
        help=(
            "use weighted probability to sample "
            "nodes (according to distance to source)"
        ),
    )
    parser.add_argument(
        "--add_nei",
        type=bool,
        default=False,
        help="add original neighbors to the cluster",
    )
    args_ = parser.parse_args()
    print(args_)
    return args_
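

# --- Editor's sketch (not part of the original module) ----------------------
# Caveat: argparse's `type=bool` converts any non-empty string to True, so a
# command-line value such as `--dist_weight False` still becomes True; only
# the Python defaults above behave as expected (the commented-out lines in
# `__main__` hint at this). A common workaround is a small converter like the
# hypothetical `str2bool` below, used as
# `parser.add_argument("--dist_weight", type=str2bool, default=False, ...)`.
def str2bool(v):
    """Interpret common truthy strings as True, everything else as False."""
    return str(v).strip().lower() in ("true", "1", "yes")
# -----------------------------------------------------------------------------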


def add_edges(file_path, r, M, dist_weight, add_nei):
    """
    Add extra edges to the file.

    1. Read the file
        1.1 get num_nodes
        1.2 get coords
        1.3 get the original edge_index
    2. Add extra edges
    3. Save the file (in place)
    """
    # read in data
    data = np.load(file_path, allow_pickle=True)
    data_object = data.item()
    coords = torch.from_numpy(data_object.get("coord"))
    num_nodes = coords.shape[0]
    edge_index = torch.from_numpy(data_object.get("edge_index_bi")).to(torch.int64)
    # build the extra (cluster) edges and store them next to the original data
    new_edges = get_new_edges(
        num_nodes, coords, edge_index, r, M, dist_weight, add_nei
    )
    data_object["cluster_edges"] = new_edges
    # save the file, overwriting the original
    np.save(file_path, data_object)
    return
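

# --- Editor's sketch (not part of the original module) ----------------------
# Minimal illustration of the per-file layout `add_edges` expects: a pickled
# dict with a "coord" array of node coordinates and a bidirectional
# "edge_index_bi" array of shape (2, num_edges). The toy 2D mesh below is an
# assumption for illustration only, and `get_new_edges` must be importable
# from `cluster_utils` for this to run.
def _demo_add_edges(tmp_file="demo_data_0.npy"):
    coords = np.array([[0.0, 0.0], [1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
    edge_index_bi = np.array(
        [[0, 1, 1, 2, 2, 3, 3, 0], [1, 0, 2, 1, 3, 2, 0, 3]]
    )
    np.save(tmp_file, {"coord": coords, "edge_index_bi": edge_index_bi})
    add_edges(tmp_file, r=0.35, M=None, dist_weight=False, add_nei=False)
    out = np.load(tmp_file, allow_pickle=True).item()
    print("cluster_edges:", out["cluster_edges"])
# -----------------------------------------------------------------------------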


def process_subset(file_path, r, M, dist_weight, add_nei):
    file_pattern = os.path.join(file_path, "data_*.npy")
    files = glob.glob(file_pattern)
    # print("files: ", files)
    print(f"processing {len(files)} files in {file_path}")
    for file in files:
        add_edges(file, r, M, dist_weight, add_nei)
    return


if __name__ == "__main__":
    print("Processing the dataset...")
    # define all the subdirectories
    all_folders = ["data", "test", "train", "val"]
    # parse arguments, get the target directory and cluster radius
    args_ = arg_parse()
    dataset_root = args_.target
    r = args_.r
    M = args_.M
    # dist_weight = True if args_.dist_weight == "True" else False
    # add_nei = True if args_.add_nei == "True" else False
    dist_weight = args_.dist_weight
    add_nei = args_.add_nei
    # get all the subdirectories
    subsets_path = [os.path.join(dataset_root, folder) for folder in all_folders]
    # iterate through all the subsets
    for subset_path in subsets_path:
        process_subset(subset_path, r, M, dist_weight, add_nei)
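

# --- Editor's sketch (not part of the original module) ----------------------
# Typical invocation (the path is illustrative):
#     python data_transform.py --target /path/to/dataset --r 0.35
# This rewrites every `data_*.npy` file under the `data`, `test`, `train` and
# `val` subdirectories in place, adding a "cluster_edges" entry to each file.
# -----------------------------------------------------------------------------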