Commit 2a1b1ca5 authored by manxilin

init update
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
# Translations
*.mo
*.pot
# Django stuff:
*.log
local_settings.py
# Flask stuff:
instance/
.webassets-cache
# Scrapy stuff:
.scrapy
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# Jupyter Notebook
.ipynb_checkpoints
# pyenv
.python-version
# celery beat schedule file
celerybeat-schedule
# SageMath parsed files
*.sage.py
# dotenv
.env
# virtualenv
.venv
venv/
ENV/
# Spyder project settings
.spyderproject
.spyproject
# Rope project settings
.ropeproject
# mkdocs documentation
/site
# mypy
.mypy_cache/
# editor
*.sw?
# data
figs/
results/
tmp/
log/
runs/
vis/
data/
data/REDDIT-MULTI-12K/
data/COLLAB
data/PROTEINS
data/PROTEINS_full
data/NCI1
## Authors
- Manxi Lin s192230
- Mengge Hu s192113
- Guangya Shen s200104
## Data set
https://drive.google.com/file/d/1nTM9c4HgIeb6iFauLQABuGjqDGpc43iv/view?usp=sharing
Download it and unzip it in the repository root, so the `data/` folder sits alongside the code
## Check our main result
- See `main.ipynb`
- Proof of our result: `./screenshots`
import torch
import torch.nn as nn
from torch.autograd import Variable
import random
"""
Set of modules for aggregating embeddings of neighbors.
"""
class MeanAggregator(nn.Module):
"""
Aggregates a node's embeddings using mean of neighbors' embeddings
"""
def __init__(self, features, cuda=False, gcn=False):
"""
Initializes the aggregator for a specific graph.
features -- function mapping LongTensor of node ids to FloatTensor of feature values.
cuda -- whether to use GPU
gcn --- whether to perform concatenation GraphSAGE-style, or add self-loops GCN-style
"""
super(MeanAggregator, self).__init__()
self.features = features
self.cuda = cuda
self.gcn = gcn
def forward(self, nodes, to_neighs, num_sample=10):
"""
nodes --- list of nodes in a batch
to_neighs --- list of sets, each set is the set of neighbors for node in batch
num_sample --- number of neighbors to sample. No sampling if None.
"""
# Local pointers to functions (speed hack)
_set = set
        if num_sample is not None:
            _sample = random.sample
            # materialize each set as a list: random.sample no longer accepts sets
            samp_neighs = [_set(_sample(list(to_neigh),
                                        num_sample,
                                        )) if len(to_neigh) >= num_sample else to_neigh for to_neigh in to_neighs]
else:
samp_neighs = to_neighs
if self.gcn:
            samp_neighs = [samp_neigh | {nodes[i]} for i, samp_neigh in enumerate(samp_neighs)]  # set union; '+' is undefined for sets
unique_nodes_list = list(set.union(*samp_neighs))
unique_nodes = {n:i for i,n in enumerate(unique_nodes_list)}
mask = Variable(torch.zeros(len(samp_neighs), len(unique_nodes))) # (n x n')
column_indices = [unique_nodes[n] for samp_neigh in samp_neighs for n in samp_neigh] # for each neigh, get index in unique_nodes_list
row_indices = [i for i in range(len(samp_neighs)) for j in range(len(samp_neighs[i]))] # node index
mask[row_indices, column_indices] = 1 # adjacent matrix
if self.cuda:
mask = mask.cuda()
num_neigh = mask.sum(1, keepdim=True) # sum for each node (n x 1)
mask = mask.div(num_neigh)
if self.cuda:
embed_matrix = self.features(torch.LongTensor(unique_nodes_list).cuda())
else:
embed_matrix = self.features(torch.LongTensor(unique_nodes_list))
to_feats = mask.mm(embed_matrix) # mean over each node (AF)
return to_feats
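# Usage sketch (not part of the original file): exercise MeanAggregator on a
# toy 4-node graph, assuming `features` is backed by an nn.Embedding lookup.
# Illustrative only.
if __name__ == '__main__':
    feat = nn.Embedding(4, 8)  # 4 nodes, 8-dim features
    agg = MeanAggregator(features=lambda ids: feat(ids))
    nodes = [0, 1]
    to_neighs = [{1, 2}, {0, 3}]  # neighbor sets for each node in the batch
    out = agg(nodes, to_neighs, num_sample=2)
    print(out.shape)  # torch.Size([2, 8])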
# ENZYMES
python -m train --datadir=data --bmname=ENZYMES --cuda=3 --max-nodes=100 --num-classes=6
# ENZYMES - Diffpool
python -m train --bmname=ENZYMES --assign-ratio=0.1 --hidden-dim=30 --output-dim=30 --cuda=1 --num-classes=6 --method=soft-assign
# DD
python -m train --datadir=data --bmname=DD --cuda=0 --max-nodes=500 --epochs=1000 --num-classes=2
# DD - Diffpool
python -m train --bmname=DD --assign-ratio=0.1 --hidden-dim=64 --output-dim=64 --cuda=1 --num-classes=2 --method=soft-assign
import networkx as nx
import numpy as np
import torch
import pickle
import random
from graph_sampler import GraphSampler
def prepare_val_data(graphs, args, val_idx, max_nodes=0):
random.shuffle(graphs)
val_size = int(np.ceil((len(graphs)*(1-args.train_ratio-args.test_ratio))))
    train_graphs = graphs[:val_idx * val_size]
    if val_idx < 9:  # assumes 10 folds; all graphs outside the val slice go to training
        train_graphs = train_graphs + graphs[(val_idx+1) * val_size :]
val_graphs = graphs[val_idx*val_size: (val_idx+1)*val_size]
print('Num training graphs: ', len(train_graphs),
'; Num validation graphs: ', len(val_graphs))
print('Number of graphs: ', len(graphs))
print('Number of edges: ', sum([G.number_of_edges() for G in graphs]))
print('Max, avg, std of graph size: ',
max([G.number_of_nodes() for G in graphs]), ', '
"{0:.2f}".format(np.mean([G.number_of_nodes() for G in graphs])), ', '
"{0:.2f}".format(np.std([G.number_of_nodes() for G in graphs])))
# minibatch
dataset_sampler = GraphSampler(train_graphs, normalize=False, max_num_nodes=max_nodes,
features=args.feature_type)
train_dataset_loader = torch.utils.data.DataLoader(
dataset_sampler,
batch_size=args.batch_size,
shuffle=True,
num_workers=args.num_workers)
dataset_sampler = GraphSampler(val_graphs, normalize=False, max_num_nodes=max_nodes,
features=args.feature_type)
val_dataset_loader = torch.utils.data.DataLoader(
dataset_sampler,
batch_size=args.batch_size,
shuffle=False,
num_workers=args.num_workers)
return train_dataset_loader, val_dataset_loader, \
dataset_sampler.max_num_nodes, dataset_sampler.feat_dim, dataset_sampler.assign_feat_dim
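# Sketch (not part of the original file): the fold arithmetic above on a plain
# list, assuming train_ratio=0.8 and test_ratio=0.1 so that val_size is 10% of
# the graphs and val_idx ranges over the 10 folds.
if __name__ == '__main__':
    items = list(range(100))
    val_size = 10
    for val_idx in range(10):
        val = items[val_idx * val_size:(val_idx + 1) * val_size]
        train = items[:val_idx * val_size]
        if val_idx < 9:
            train = train + items[(val_idx + 1) * val_size:]
        assert len(val) == 10 and len(train) == 90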
# ENZYMES
# python -m train --datadir=data --bmname=ENZYMES --cuda=0 --max-nodes=100 --num-classes=6
# ENZYMES - Diffpool
#python -m train --bmname=ENZYMES --assign-ratio=0.1 --hidden-dim=30 --output-dim=30 --cuda=0 --epochs=1 --num-classes=6 --method=soft-assign --dropout=0.8
# DD
# python -m train --datadir=data --bmname=DD --cuda=0 --max-nodes=500 --epochs=1000 --num-classes=2
# DD - Diffpool
#python -m train --bmname=DD --assign-ratio=0.1 --hidden-dim=64 --output-dim=64 --cuda=0 --num-classes=2 --method=soft-assign --epochs=1
!python -m train --bmname=DD --batch-size=30 \
--dropout=0.3 --assign-ratio=0.5 --unpool-ratio=0.5 \
--hidden-dim=64 --output-dim=64 --cuda=0 --num-classes=2 \
--method=soft-assign --num-pool=3 --epochs=100 --num-unpool=3 \
--weight-decay=0
import networkx as nx
import numpy as np
import random
import gen.feat as featgen
import util
def gen_ba(n_range, m_range, num_graphs, feature_generator=None):
graphs = []
for i in np.random.choice(n_range, num_graphs):
for j in np.random.choice(m_range, 1):
graphs.append(nx.barabasi_albert_graph(i,j))
if feature_generator is None:
        feature_generator = featgen.ConstFeatureGen(0)  # defined in gen.feat
for G in graphs:
feature_generator.gen_node_features(G)
return graphs
def gen_er(n_range, p, num_graphs, feature_generator=None):
graphs = []
for i in np.random.choice(n_range, num_graphs):
graphs.append(nx.erdos_renyi_graph(i,p))
if feature_generator is None:
        feature_generator = featgen.ConstFeatureGen(0)  # defined in gen.feat
for G in graphs:
feature_generator.gen_node_features(G)
return graphs
def gen_2community_ba(n_range, m_range, num_graphs, inter_prob, feature_generators):
''' Each community is a BA graph.
Args:
inter_prob: probability of one node connecting to any node in the other community.
'''
if feature_generators is None:
        mu0 = np.zeros(10)
        mu1 = np.ones(10)
        sigma0 = np.ones((10, 10)) * 0.1  # np.ones takes a shape tuple
        sigma1 = np.ones((10, 10)) * 0.1
        fg0 = featgen.GaussianFeatureGen(mu0, sigma0)
        fg1 = featgen.GaussianFeatureGen(mu1, sigma1)
else:
fg0 = feature_generators[0]
fg1 = feature_generators[1] if len(feature_generators) > 1 else feature_generators[0]
graphs1 = []
graphs2 = []
#for (i1, i2) in zip(np.random.choice(n_range, num_graphs),
# np.random.choice(n_range, num_graphs)):
# for (j1, j2) in zip(np.random.choice(m_range, num_graphs),
# np.random.choice(m_range, num_graphs)):
graphs0 = gen_ba(n_range, m_range, num_graphs, fg0)
graphs1 = gen_ba(n_range, m_range, num_graphs, fg1)
graphs = []
for i in range(num_graphs):
G = nx.disjoint_union(graphs0[i], graphs1[i])
n0 = graphs0[i].number_of_nodes()
for j in range(n0):
if np.random.rand() < inter_prob:
target = np.random.choice(G.number_of_nodes() - n0) + n0
G.add_edge(j, target)
graphs.append(G)
return graphs
def gen_2hier(num_graphs, num_clusters, n, m_range, inter_prob1, inter_prob2, feat_gen):
''' Each community is a BA graph.
Args:
        inter_prob1: probability of one node connecting to any node in another community
            within the same large cluster.
        inter_prob2: probability of one node connecting to any node in another community
            across large clusters.
'''
graphs = []
for i in range(num_graphs):
clusters2 = []
for j in range(len(num_clusters)):
clusters = gen_er(range(n, n+1), 0.5, num_clusters[j], feat_gen[0])
G = nx.disjoint_union_all(clusters)
for u1 in range(G.number_of_nodes()):
if np.random.rand() < inter_prob1:
target = np.random.choice(G.number_of_nodes() - n)
# move one cluster after to make sure it's not an intra-cluster edge
if target // n >= u1 // n:
target += n
G.add_edge(u1, target)
clusters2.append(G)
G = nx.disjoint_union_all(clusters2)
cluster_sizes_cum = np.cumsum([cluster2.number_of_nodes() for cluster2 in clusters2])
curr_cluster = 0
for u1 in range(G.number_of_nodes()):
if u1 >= cluster_sizes_cum[curr_cluster]:
curr_cluster += 1
if np.random.rand() < inter_prob2:
target = np.random.choice(G.number_of_nodes() -
clusters2[curr_cluster].number_of_nodes())
# move one cluster after to make sure it's not an intra-cluster edge
if curr_cluster == 0 or target >= cluster_sizes_cum[curr_cluster - 1]:
target += cluster_sizes_cum[curr_cluster]
G.add_edge(u1, target)
graphs.append(G)
return graphs
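# Usage sketch (not part of the original file): generate a few synthetic graphs
# with the helpers above, assuming constant node features via featgen.
if __name__ == '__main__':
    fgs = [featgen.ConstFeatureGen(np.ones(10)), featgen.ConstFeatureGen(np.zeros(10))]
    two_comm = gen_2community_ba(range(20, 30), range(2, 4), num_graphs=3,
                                 inter_prob=0.05, feature_generators=fgs)
    # m_range is unused by gen_2hier, which builds ER clusters internally
    hier = gen_2hier(num_graphs=2, num_clusters=[2, 3], n=10, m_range=None,
                     inter_prob1=0.1, inter_prob2=0.02, feat_gen=[fgs[0]])
    print([g.number_of_nodes() for g in two_comm])
    print([g.number_of_nodes() for g in hier])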
import abc
import networkx as nx
import numpy as np
import random
class FeatureGen(metaclass=abc.ABCMeta):
@abc.abstractmethod
def gen_node_features(self, G):
pass
class ConstFeatureGen(FeatureGen):
def __init__(self, val):
self.val = val
def gen_node_features(self, G):
feat_dict = {i:{'feat': self.val} for i in G.nodes()}
nx.set_node_attributes(G, feat_dict)
class GaussianFeatureGen(FeatureGen):
def __init__(self, mu, sigma):
self.mu = mu
self.sigma = sigma
    def gen_node_features(self, G):
        feat = np.random.multivariate_normal(self.mu, self.sigma, G.number_of_nodes())
feat_dict = {i:{'feat': feat[i]} for i in range(feat.shape[0])}
nx.set_node_attributes(G, feat_dict)
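# Usage sketch (not part of the original file): attach 5-dim Gaussian features
# to a small graph.
if __name__ == '__main__':
    G = nx.karate_club_graph()
    gen = GaussianFeatureGen(np.zeros(5), np.eye(5) * 0.1)
    gen.gen_node_features(G)
    print(nx.get_node_attributes(G, 'feat')[0].shape)  # (5,)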
import networkx as nx
import numpy as np
import torch
import torch.utils.data
import util
class GraphSampler(torch.utils.data.Dataset):
''' Sample graphs and nodes in graph
'''
def __init__(self, G_list, features='default', normalize=True, assign_feat='default', max_num_nodes=0):
self.adj_all = []
self.len_all = []
self.feature_all = []
self.label_all = []
self.assign_feat_all = []
if max_num_nodes == 0:
self.max_num_nodes = max([G.number_of_nodes() for G in G_list])
else:
self.max_num_nodes = max_num_nodes
self.feat_dim = util.node_dict(G_list[0])[0]['feat'].shape[0]
for G in G_list:
adj = np.array(nx.to_numpy_matrix(G))
if normalize:
sqrt_deg = np.diag(1.0 / np.sqrt(np.sum(adj, axis=0, dtype=float).squeeze()))
adj = np.matmul(np.matmul(sqrt_deg, adj), sqrt_deg)
self.adj_all.append(adj)
self.len_all.append(G.number_of_nodes())
self.label_all.append(G.graph['label'])
# feat matrix: max_num_nodes x feat_dim
if features == 'default':
f = np.zeros((self.max_num_nodes, self.feat_dim), dtype=float)
for i,u in enumerate(G.nodes()):
f[i,:] = util.node_dict(G)[u]['feat']
self.feature_all.append(f)
elif features == 'id':
self.feature_all.append(np.identity(self.max_num_nodes))
elif features == 'deg-num':
degs = np.sum(np.array(adj), 1)
                degs = np.expand_dims(np.pad(degs, [0, self.max_num_nodes - G.number_of_nodes()],
                                             'constant'),
                                      axis=1)
self.feature_all.append(degs)
elif features == 'deg':
self.max_deg = 10
degs = np.sum(np.array(adj), 1).astype(int)
                degs[degs > self.max_deg] = self.max_deg
feat = np.zeros((len(degs), self.max_deg + 1))
feat[np.arange(len(degs)), degs] = 1
feat = np.pad(feat, ((0, self.max_num_nodes - G.number_of_nodes()), (0, 0)),
'constant', constant_values=0)
f = np.zeros((self.max_num_nodes, self.feat_dim), dtype=float)
for i,u in enumerate(util.node_iter(G)):
f[i,:] = util.node_dict(G)[u]['feat']
feat = np.concatenate((feat, f), axis=1)
self.feature_all.append(feat)
elif features == 'struct':
self.max_deg = 10
degs = np.sum(np.array(adj), 1).astype(int)
                degs[degs > self.max_deg] = self.max_deg
feat = np.zeros((len(degs), self.max_deg + 1))
feat[np.arange(len(degs)), degs] = 1
degs = np.pad(feat, ((0, self.max_num_nodes - G.number_of_nodes()), (0, 0)),
'constant', constant_values=0)
clusterings = np.array(list(nx.clustering(G).values()))
clusterings = np.expand_dims(np.pad(clusterings,
[0, self.max_num_nodes - G.number_of_nodes()],
'constant'),
axis=1)
g_feat = np.hstack([degs, clusterings])
if 'feat' in util.node_dict(G)[0]:
node_feats = np.array([util.node_dict(G)[i]['feat'] for i in range(G.number_of_nodes())])
node_feats = np.pad(node_feats, ((0, self.max_num_nodes - G.number_of_nodes()), (0, 0)),
'constant')
g_feat = np.hstack([g_feat, node_feats])
self.feature_all.append(g_feat)
if assign_feat == 'id':
self.assign_feat_all.append(
np.hstack((np.identity(self.max_num_nodes), self.feature_all[-1])) )
else:
self.assign_feat_all.append(self.feature_all[-1])
self.feat_dim = self.feature_all[0].shape[1]
self.assign_feat_dim = self.assign_feat_all[0].shape[1]
def __len__(self):
return len(self.adj_all)
def __getitem__(self, idx):
adj = self.adj_all[idx]
num_nodes = adj.shape[0]
adj_padded = np.zeros((self.max_num_nodes, self.max_num_nodes))
adj_padded[:num_nodes, :num_nodes] = adj
# use all nodes for aggregation (baseline)
return {'adj':adj_padded,
'feats':self.feature_all[idx].copy(),
'label':self.label_all[idx],
'num_nodes': num_nodes,
'assign_feats':self.assign_feat_all[idx].copy()}
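# Usage sketch (not part of the original file): wrap two toy graphs in
# GraphSampler and batch them. Assumes util.node_dict returns the graph's
# node-attribute mapping and a networkx old enough to provide nx.to_numpy_matrix.
if __name__ == '__main__':
    G_list = []
    for label in (0, 1):
        G = nx.erdos_renyi_graph(8, 0.3)
        nx.set_node_attributes(G, {u: {'feat': np.ones(4)} for u in G.nodes()})
        G.graph['label'] = label
        G_list.append(G)
    sampler = GraphSampler(G_list, features='default', normalize=False)
    loader = torch.utils.data.DataLoader(sampler, batch_size=2)
    batch = next(iter(loader))
    print(batch['adj'].shape, batch['feats'].shape)  # (2, 8, 8) and (2, 8, 4)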
import torch
import torch.nn as nn
from torch.nn import init
import numpy as np
class SupervisedGraphSage(nn.Module):
''' GraphSage embeddings
'''
def __init__(self, num_classes, enc):
super(SupervisedGraphSage, self).__init__()
self.enc = enc
self.xent = nn.CrossEntropyLoss()
self.weight = nn.Parameter(torch.FloatTensor(enc.embed_dim, num_classes))
        init.xavier_uniform_(self.weight)
def forward(self, nodes):
embeds = self.enc(nodes)
scores = embeds.mm(self.weight)
return scores
    def loss(self, nodes, labels):
        scores = self.forward(nodes)
        # CrossEntropyLoss expects raw logits, so no softmax here
        return self.xent(scores, labels.squeeze())
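# Usage sketch (not part of the original file): drive SupervisedGraphSage with
# a stand-in encoder that maps node ids to learnable embeddings.
if __name__ == '__main__':
    class DummyEnc(nn.Module):
        def __init__(self, embed_dim=16):
            super(DummyEnc, self).__init__()
            self.embed_dim = embed_dim
            self.embed = nn.Embedding(10, embed_dim)
        def forward(self, nodes):
            return self.embed(torch.LongTensor(nodes))
    model = SupervisedGraphSage(num_classes=3, enc=DummyEnc())
    labels = torch.LongTensor([[0], [1], [2]])
    print(model.loss([0, 1, 2], labels))  # scalar cross-entropy loss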
import networkx as nx
import numpy as np
import scipy as sc
import os
import re
import util
def read_graphfile(datadir, dataname, max_nodes=None):
''' Read data from https://ls11-www.cs.tu-dortmund.de/staff/morris/graphkerneldatasets
graph index starts with 1 in file
Returns:
List of networkx objects with graph and node labels
'''
prefix = os.path.join(datadir, dataname, dataname)
filename_graph_indic = prefix + '_graph_indicator.txt'
# index of graphs that a given node belongs to
graph_indic={}
with open(filename_graph_indic) as f: # no problem
i=1
for line in f:
line=line.strip("\n")
graph_indic[i]=int(line)
i+=1
filename_nodes=prefix + '_node_labels.txt'
node_labels=[]
try:
with open(filename_nodes) as f:
for line in f:
line=line.strip("\n")
node_labels+=[int(line) - 1]
num_unique_node_labels = max(node_labels) + 1
except IOError:
print('No node labels')
filename_node_attrs=prefix + '_node_attributes.txt'
node_attrs=[]
try:
with open(filename_node_attrs) as f:
for line in f:
                line = line.strip()
attrs = [float(attr) for attr in re.split("[,\s]+", line) if not attr == '']
node_attrs.append(np.array(attrs))
except IOError:
print('No node attributes')
label_has_zero = False
filename_graphs=prefix + '_graph_labels.txt'
graph_labels=[]
    # assume that all graph labels appear in the dataset
    # (the set of labels need not be consecutive)
label_vals = []
with open(filename_graphs) as f:
for line in f:
line=line.strip("\n")
val = int(line)
#if val == 0:
# label_has_zero = True
if val not in label_vals:
label_vals.append(val)
graph_labels.append(val)
#graph_labels = np.array(graph_labels)
label_map_to_int = {val: i for i, val in enumerate(label_vals)}
graph_labels = np.array([label_map_to_int[l] for l in graph_labels])
#if label_has_zero:
# graph_labels += 1
filename_adj=prefix + '_A.txt'
adj_list={i:[] for i in range(1,len(graph_labels)+1)}
index_graph={i:[] for i in range(1,len(graph_labels)+1)}
num_edges = 0
with open(filename_adj) as f:
for line in f:
line=line.strip("\n").split(",")
e0,e1=(int(line[0].strip(" ")),int(line[1].strip(" ")))
adj_list[graph_indic[e0]].append((e0,e1))
index_graph[graph_indic[e0]]+=[e0,e1]
num_edges += 1
for k in index_graph.keys():
index_graph[k]=[u-1 for u in set(index_graph[k])]
graphs=[]
for i in range(1,1+len(adj_list)):
# indexed from 1 here
G=nx.from_edgelist(adj_list[i])
if max_nodes is not None and G.number_of_nodes() > max_nodes:
continue
# add features and labels
G.graph['label'] = graph_labels[i-1]
for u in util.node_iter(G):
if len(node_labels) > 0:
node_label_one_hot = [0] * num_unique_node_labels
node_label = node_labels[u-1]
node_label_one_hot[node_label] = 1
util.node_dict(G)[u]['label'] = node_label_one_hot
if len(node_attrs) > 0:
util.node_dict(G)[u]['feat'] = node_attrs[u-1]
if len(node_attrs) > 0:
G.graph['feat_dim'] = node_attrs[0].shape[0]
# relabeling
mapping={}
it=0
for n in util.node_iter(G):
mapping[n]=it
it+=1
# indexed from 0
graphs.append(nx.relabel_nodes(G, mapping))
return graphs
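# Usage sketch (not part of the original file): load the DD benchmark, assuming
# it has been unzipped under ./data as described in the README.
if __name__ == '__main__':
    graphs = read_graphfile('data', 'DD', max_nodes=500)
    print(len(graphs), graphs[0].number_of_nodes(), graphs[0].graph['label'])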
import numpy as np

class UnionFind:
    ''' Minimal union-find over nodes 0..n-1, used by kruskal below. '''
    def __init__(self, n):
        self.parent = list(range(n))
    def find(self, u):
        while self.parent[u] != u:
            self.parent[u] = self.parent[self.parent[u]]  # path halving
            u = self.parent[u]
        return u
    def connected(self, u, v):
        return self.find(u) == self.find(v)
    def union(self, u, v):
        self.parent[self.find(u)] = self.find(v)

def partition(embeddings):
    ''' Compute a partition of embeddings, where each partition is pooled together.
    Args:
        embeddings: N-by-D matrix, where N is the number of node embeddings, and D
            is the embedding dimension.
    '''
    # pairwise similarities between embeddings
    dist = np.dot(embeddings, embeddings.T)
    # assumed completion: partition via an MST over the similarity graph
    return kruskal(dist)

def kruskal(adj):
    # initialize MST
    MST = set()
    edges = set()
    num_nodes = adj.shape[0]
    # collect all edges of the weighted adjacency matrix adj
    for j in range(num_nodes):
        for k in range(num_nodes):
            if adj[j][k] != 0 and (k, j) not in edges:
                edges.add((j, k))
    # sort all edges by weight from smallest to largest
    sorted_edges = sorted(edges, key=lambda e: adj[e[0]][e[1]])
    uf = UnionFind(num_nodes)
    for e in sorted_edges:
        u, v = e
        # if u, v already connected, skip this edge
        if uf.connected(u, v):
            continue
        # if not, connect them and add this edge to the MST
        uf.union(u, v)
        MST.add(e)
    return MST
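# Sketch (not part of the original file): Kruskal on a tiny weighted graph.
if __name__ == '__main__':
    adj = np.array([[0., 1., 4.],
                    [1., 0., 2.],
                    [4., 2., 0.]])
    print(kruskal(adj))  # {(0, 1), (1, 2)} (set order may vary)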
'''
author: lmx
date: 11/17/2020
'''
# imports
import networkx as nx
import numpy
import sys
from glob import glob
from load_data import read_graphfile
from matplotlib import pyplot as plt
import torch
import math
from collections import Counter
#----------CONSTANTS----------
DATA_DIR = './data/' # parent folder
DATA_NAME = 'DD'
############
NEW_DIR = DATA_DIR # new path
NAME = 'DDD' # new name
NUM = 10 # new graph number
############
# fetch graphs
# g = read_graphfile(DATA_DIR, DATA_NAME)
# assert NUM <= len(g),'exceed maximum graph number'
# nodes = list(map(lambda x: len(x.nodes), g))
# edges = list(map(lambda x: len(x.edges), g))
#-> PART 1
# fetch basic information
base = '{}{}/{}_graph_indicator.txt'
old_dir = base.format(DATA_DIR, DATA_NAME, DATA_NAME)
new_dir = base.format(NEW_DIR, NAME, NAME)
with open(old_dir, 'r') as f:
c = f.read()
a = c.split('\n')
cnter = Counter(a)
cnter = list(zip(cnter.values(), cnter.keys()))
cnter = cnter[:-1]
# sort by graph id (the dict key), then keep only the per-graph node counts
cnter = sorted(cnter, key=lambda x: int(x[1]))
cnter = [x[0] for x in cnter]
base = '{}{}/{}_A.txt'
old_dir = base.format(DATA_DIR, DATA_NAME, DATA_NAME)
new_dir = base.format(NEW_DIR, NAME, NAME)
with open(old_dir, 'r') as f:
c = f.read()
old_adj = c.split('\n')
def parse_edge(x):
    # parse one "u, v" line of the edge list; malformed lines become None
    parts = x.split(',')
    if len(parts) == 2:
        return [int(parts[0]), int(parts[1])]
    return None
old_adj = list(map(parse_edge, old_adj))
old_adj = old_adj[:-1]
NODE_NUM = sum(cnter[:NUM])
i = 0
for i in range(len(old_adj)):
tmp = old_adj[i]
e0, e1 = tmp[0], tmp[1]
if max([e0, e1]) > NODE_NUM:
break
EDGE_NUM = i
print('There are {} graphs, {} nodes and {} edges.'.format(NUM, NODE_NUM, EDGE_NUM))
#-> PART 2
# Adjacency matrix
base = '{}{}/{}_A.txt'
old_dir = base.format(DATA_DIR, DATA_NAME, DATA_NAME)
new_dir = base.format(NEW_DIR, NAME, NAME)
with open(old_dir, 'r') as f:
c = f.read()
old_adj = c.split('\n')
new_adj = old_adj[:EDGE_NUM]
new_adj = '\n'.join(new_adj)
with open(new_dir, 'w') as f:
f.write(new_adj)
print('Write Adjacency Matrix in {}'.format(new_dir))
#-> PART 3
# Graph indicator
base = '{}{}/{}_graph_indicator.txt'
old_dir = base.format(DATA_DIR, DATA_NAME, DATA_NAME)
new_dir = base.format(NEW_DIR, NAME, NAME)
with open(old_dir, 'r') as f:
c = f.read()
old_ind = c.split('\n')
new_ind = old_ind[:NODE_NUM]
new_ind = '\n'.join(new_ind)
with open(new_dir, 'w') as f:
f.write(new_ind)
print('Write Graph Indicator in {}'.format(new_dir))
#-> PART 4
# Graph labels
base = '{}{}/{}_graph_labels.txt'
old_dir = base.format(DATA_DIR, DATA_NAME, DATA_NAME)
new_dir = base.format(NEW_DIR, NAME, NAME)
with open(old_dir, 'r') as f:
c = f.read()
old_label = c.split('\n')
new_label = old_label[:NUM]
new_label = '\n'.join(new_label)
with open(new_dir, 'w') as f:
f.write(new_label)
print('Write Graph Labels in {}'.format(new_dir))
#-> PART 5
# Node labels
base = '{}{}/{}_node_labels.txt'
old_dir = base.format(DATA_DIR, DATA_NAME, DATA_NAME)
new_dir = base.format(NEW_DIR, NAME, NAME)
with open(old_dir, 'r') as f:
c = f.read()
old_label = c.split('\n')
new_label = old_label[:NODE_NUM]
new_label = '\n'.join(new_label)
with open(new_dir, 'w') as f:
f.write(new_label)
print('Write Node Labels in {}'.format(new_dir))
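#-> Verification sketch (not part of the original script): reload the truncated
# dataset with the repo's own reader; assumes the four files above were written
# into DATA_DIR/NAME/ and that read_graphfile tolerates the missing
# node-attribute file (it prints 'No node attributes' and continues).
g = read_graphfile(NEW_DIR, NAME)
print('Reloaded {} graphs.'.format(len(g)))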
import torch
import torch.nn as nn
from torch.nn import init
import torch.nn.functional as F
import numpy as np
class Set2Set(nn.Module):
def __init__(self, input_dim, hidden_dim, act_fn=nn.ReLU, num_layers=1):
'''
Args:
input_dim: input dim of Set2Set.
hidden_dim: the dim of set representation, which is also the INPUT dimension of
the LSTM in Set2Set.
This is a concatenation of weighted sum of embedding (dim input_dim), and the LSTM
hidden/output (dim: self.lstm_output_dim).
'''
super(Set2Set, self).__init__()
self.input_dim = input_dim
self.hidden_dim = hidden_dim
self.num_layers = num_layers
        if hidden_dim <= input_dim:
            print('ERROR: Set2Set hidden_dim should be larger than input_dim')
        # the hidden is a concatenation of weighted sum of embedding and LSTM output
        self.lstm_output_dim = hidden_dim - input_dim
        # note: the zero states built in forward() have size lstm_output_dim, which
        # matches this LSTM's hidden size (input_dim) only when hidden_dim == 2 * input_dim
        self.lstm = nn.LSTM(hidden_dim, input_dim, num_layers=num_layers, batch_first=True)
# convert back to dim of input_dim
self.pred = nn.Linear(hidden_dim, input_dim)
self.act = act_fn()
def forward(self, embedding):
'''
Args:
embedding: [batch_size x n x d] embedding matrix
Returns:
aggregated: [batch_size x d] vector representation of all embeddings
'''
batch_size = embedding.size()[0]
n = embedding.size()[1]
hidden = (torch.zeros(self.num_layers, batch_size, self.lstm_output_dim).cuda(),
torch.zeros(self.num_layers, batch_size, self.lstm_output_dim).cuda())
q_star = torch.zeros(batch_size, 1, self.hidden_dim).cuda()
for i in range(n):
# q: batch_size x 1 x input_dim
q, hidden = self.lstm(q_star, hidden)
# e: batch_size x n x 1
e = embedding @ torch.transpose(q, 1, 2)
a = nn.Softmax(dim=1)(e)
r = torch.sum(a * embedding, dim=1, keepdim=True)
q_star = torch.cat((q, r), dim=2)
q_star = torch.squeeze(q_star, dim=1)
out = self.act(self.pred(q_star))
return out
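# Usage sketch (not part of the original file): pool 7 16-dim embeddings per
# graph into one 16-dim vector. Assumes a CUDA device, since forward()
# hard-codes .cuda(), and hidden_dim == 2 * input_dim as noted above.
if __name__ == '__main__':
    s2s = Set2Set(input_dim=16, hidden_dim=32).cuda()
    emb = torch.rand(4, 7, 16).cuda()
    print(s2s(emb).shape)  # torch.Size([4, 16])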
import torch
import torch.nn as nn
from torch.nn import init
import torch.nn.functional as F
import numpy as np
from set2set import Set2Set
from encoders import SoftPoolingGcnEncoder
# GCN basic operation
class GraphConv(nn.Module):
def __init__(self, input_dim, output_dim, add_self=False, normalize_embedding=False,
dropout=0.0, bias=True):
super(GraphConv, self).__init__()
self.add_self = add_self
self.dropout = dropout
if dropout > 0.001:
self.dropout_layer = nn.Dropout(p=dropout)
self.normalize_embedding = normalize_embedding
self.input_dim = input_dim
self.output_dim = output_dim
        # xavier-init the weight; a bare torch.FloatTensor(...) is uninitialized memory
        self.weight = nn.Parameter(init.xavier_uniform_(torch.empty(input_dim, output_dim)).cuda())
        if bias:
            self.bias = nn.Parameter(torch.zeros(output_dim).cuda())
else:
self.bias = None
def forward(self, x, adj):
if self.dropout > 0.001:
x = self.dropout_layer(x)
y = torch.matmul(adj, x)
if self.add_self:
y += x
y = torch.matmul(y,self.weight)
if self.bias is not None:
y = y + self.bias
if self.normalize_embedding:
y = F.normalize(y, p=2, dim=2)
#print(y[0][0])
return y
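# Usage sketch (not part of the original file): one graph-convolution step on a
# random batch. Assumes a CUDA device, since the layer puts its parameters on
# the GPU.
if __name__ == '__main__':
    conv = GraphConv(input_dim=10, output_dim=16)
    x_demo = torch.rand(4, 25, 10).cuda()
    adj_demo = torch.rand(4, 25, 25).cuda()
    print(conv(x_demo, adj_demo).shape)  # torch.Size([4, 25, 16])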
class GConvModule(nn.Module):
def __init__(self, input_dim, hidden_dim, embedding_dim, label_dim, num_layers,
pred_hidden_dims=[], concat=True, bn=True, dropout=0.0, normalize=False, num_aggs=1,
args=None):
super(GConvModule, self).__init__()
add_self = not concat
self.conv_first = GraphConv(input_dim=input_dim, output_dim=hidden_dim, add_self=add_self,
normalize_embedding=normalize, bias=True)
self.conv_block = nn.ModuleList(
[GraphConv(input_dim=hidden_dim, output_dim=hidden_dim, add_self=add_self,
normalize_embedding=normalize, dropout=dropout, bias=True)
for i in range(num_layers-2)])
self.conv_last = GraphConv(input_dim=hidden_dim, output_dim=embedding_dim, add_self=add_self,
normalize_embedding=normalize, bias=True)
self.act = nn.ReLU()
self.bn = bn
self.num_aggs = num_aggs
self.concat = concat
if concat:
pred_input_dim = hidden_dim * (num_layers - 1) + embedding_dim
else:
pred_input_dim = embedding_dim
pred_input_dim = pred_input_dim * num_aggs
if len(pred_hidden_dims) == 0:
pred_model = nn.Linear(pred_input_dim, label_dim)
else:
pred_layers = []
for pred_dim in pred_hidden_dims:
pred_layers.append(nn.Linear(pred_input_dim, pred_dim))
pred_layers.append(self.act)
pred_input_dim = pred_dim
pred_layers.append(nn.Linear(pred_dim, label_dim))
pred_model = nn.Sequential(*pred_layers)
self.pred_block = pred_model
def apply_bn(self, x):
''' Batch normalization of 3D tensor x
'''
bn_module = nn.BatchNorm1d(x.size()[1]).cuda()
return bn_module(x)
def forward(self, x, adj, embedding_mask=None):
x = self.conv_first(x, adj)
x = self.act(x)
if self.bn:
x = self.apply_bn(x)
x_all = [x]
for i in range(len(self.conv_block)):
x = self.conv_block[i](x,adj)
x = self.act(x)
if self.bn:
x = self.apply_bn(x)
x_all.append(x)
x = self.conv_last(x,adj)
x_all.append(x)
# x_tensor: [batch_size x num_nodes x embedding]
x_tensor = torch.cat(x_all, dim=2)
if embedding_mask is not None:
x_tensor = x_tensor * embedding_mask
ypred = self.pred_block(x_tensor)
return ypred
if __name__=='__main__':
x = torch.rand(20, 100, 10).cuda()
adj = torch.rand(20, 100, 100).cuda()
#net = GConvModule(10, 10, 5, 3, num_layers=5)
net = SoftPoolingGcnEncoder(100, 10, 10, 5, 3, 5, 5, num_pooling=2,
assign_ratio=0.1, num_unpooling=2, unpool_ratio=0.1)
net = net.cuda()
a = net.forward(x, adj, range(20), True, True)
print(a)
'''
from load_data import read_graphfile
g = read_graphfile('./data', 'DND')
print(len(g))
print(g)
'''