{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.linear_model import SGDClassifier\n",
"from sklearn.utils import shuffle\n",
"from tqdm import tqdm\n",
"import pandas as pd\n",
"from collections import Counter\n",
"import random\n",
"import os"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"random.seed(0)\n",
"np.random.seed(0)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"config = {\n",
" #embedding computation\n",
" 'cleora_n_iter': 5,\n",
" 'cleora_dim': 1024,\n",
" \n",
" #dataset preparation\n",
" 'train_test_split': 0.2,\n",
" \n",
" 'batch_size': 256,\n",
" 'test_batch_size': 1000,\n",
" 'epochs': [2],\n",
" 'alpha': [1e-4],\n",
"}"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Dataset preparation"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"1. Download the Facebook dataset from SNAP: https://snap.stanford.edu/data/facebook-large-page-page-network.html -> ```\n",
"wget https://snap.stanford.edu/data/facebook_large.zip\n",
"```\n",
"2. Extract the dataset to ./facebook_large/\n",
"\n",
"Other datasets from SNAP can be preprocessed similarly."
]
},
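{
"cell_type": "markdown",
"metadata": {},
"source": [
"Alternatively, the cell below is a minimal sketch that downloads and extracts the dataset from Python instead of `wget`, assuming the SNAP URL above is still live and that the archive unpacks into a top-level `facebook_large/` directory."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import urllib.request\n",
"import zipfile\n",
"\n",
"#download and extract only if the edge list is not already present\n",
"if not os.path.exists(\"./facebook_large/musae_facebook_edges.csv\"):\n",
"    urllib.request.urlretrieve(\"https://snap.stanford.edu/data/facebook_large.zip\", \"facebook_large.zip\")\n",
"    with zipfile.ZipFile(\"facebook_large.zip\") as zf:\n",
"        zf.extractall(\".\")"
]
},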
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"df = pd.read_csv(\"./facebook_large/musae_facebook_edges.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id_1 | \n",
" id_2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0 | \n",
" 18427 | \n",
"
\n",
" \n",
" 1 | \n",
" 1 | \n",
" 21708 | \n",
"
\n",
" \n",
" 2 | \n",
" 1 | \n",
" 22208 | \n",
"
\n",
" \n",
" 3 | \n",
" 1 | \n",
" 22171 | \n",
"
\n",
" \n",
" 4 | \n",
" 1 | \n",
" 6829 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id_1 id_2\n",
"0 0 18427\n",
"1 1 21708\n",
"2 1 22208\n",
"3 1 22171\n",
"4 1 6829"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"train, test = train_test_split(df, test_size=config['train_test_split'])"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" id_1 | \n",
" id_2 | \n",
"
\n",
" \n",
" \n",
" \n",
" 144023 | \n",
" 13582 | \n",
" 17833 | \n",
"
\n",
" \n",
" 137388 | \n",
" 8090 | \n",
" 16502 | \n",
"
\n",
" \n",
" 128146 | \n",
" 11323 | \n",
" 20700 | \n",
"
\n",
" \n",
" 92044 | \n",
" 7263 | \n",
" 16965 | \n",
"
\n",
" \n",
" 19 | \n",
" 1 | \n",
" 16260 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" id_1 id_2\n",
"144023 13582 17833\n",
"137388 8090 16502\n",
"128146 11323 20700\n",
"92044 7263 16965\n",
"19 1 16260"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"train.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"fb_cleora_input_clique_filename = \"fb_cleora_input_clique.txt\"\n",
"fb_cleora_input_star_filename = \"fb_cleora_input_star.txt\"\n",
"fb_lp_train_filename = \"fb_lp_train.txt\"\n",
"fb_lp_test_filename = \"fb_lp_test.txt\"\n",
"output_dir = 'output'"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"with open(fb_cleora_input_clique_filename, \"w\") as f_cleora_clique, open(fb_cleora_input_star_filename, \"w\") as f_cleora_star, open(fb_lp_train_filename, \"w\") as f_train:\n",
" grouped_train = train.groupby('id_1')\n",
" for n, (name, group) in enumerate(grouped_train):\n",
" group_list = group['id_2'].tolist()\n",
" group_elems = list(map(str, group_list))\n",
" f_cleora_clique.write(\"{} {}\\n\".format(name, ' '.join(group_elems)))\n",
" f_cleora_star.write(\"{}\\t{}\\n\".format(n, name))\n",
" for elem in group_elems:\n",
" f_train.write(\"{}\\t{}\\n\".format(name, elem))\n",
" f_cleora_star.write(\"{}\\t{}\\n\".format(n, elem))"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"with open(fb_lp_test_filename, \"w\") as f_test:\n",
" grouped_test = test.groupby('id_1')\n",
" for name, group in grouped_test:\n",
" group_list = group['id_2'].tolist()\n",
" group_elems = list(map(str, group_list))\n",
" for elem in group_elems:\n",
" f_test.write(\"{}\\t{}\\n\".format(name, elem))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Cleora training"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Download an appropriate binary Cleora release from: https://github.com/Synerise/cleora/releases . \n",
"\n",
"A Linux GNU version is assumed in this example, but any other will do."
]
},
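{
"cell_type": "markdown",
"metadata": {},
"source": [
"The cell below is an optional convenience sketch: it fetches the Linux GNU binary used in this notebook, assuming the standard GitHub release asset URL pattern for the `v1.0.1` release, and marks it as executable."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import stat\n",
"import urllib.request\n",
"\n",
"cleora_binary = \"cleora-v1.0.1-x86_64-unknown-linux-gnu\"\n",
"if not os.path.exists(cleora_binary):\n",
"    #assumed GitHub release asset URL for v1.0.1; adjust if you use a different build\n",
"    url = \"https://github.com/Synerise/cleora/releases/download/v1.0.1/\" + cleora_binary\n",
"    urllib.request.urlretrieve(url, cleora_binary)\n",
"    os.chmod(cleora_binary, os.stat(cleora_binary).st_mode | stat.S_IEXEC)"
]
},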
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"import subprocess\n",
"\n",
"\n",
"def columns2output_filename(output_dir, columns):\n",
" columns_split = columns.split()\n",
" if len(columns_split) == 1 and 'reflexive' in columns:\n",
" column_name = columns.split('::')[-1]\n",
" return os.path.join(output_dir, f'emb__{column_name}__{column_name}.out')\n",
"\n",
" column_names = [i.split('::')[-1] for i in columns_split]\n",
" return os.path.join(output_dir, 'emb__' + '__'.join(column_names) + '.out')\n",
"\n",
"\n",
"def train_cleora(dim, n_iter, columns, input_filename, output_dir):\n",
" command = ['./cleora-v1.0.1-x86_64-unknown-linux-gnu',\n",
" '--columns', columns,\n",
" '--dimension', str(dim), \n",
" '-n', str(n_iter), \n",
" '--input', input_filename, \n",
" '-o', output_dir]\n",
" subprocess.run(command, check=True)\n",
" return columns2output_filename(output_dir, columns)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Star expansion\n",
"\n",
"In the `fb_cleora_input_star.txt` file the first column is a virtual node. The parameter `-c \"transient::cluster_id node\"` means that embeddings will not be created for nodes from this column. This translates to star expansion scheme."
]
},
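{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick peek at the star-expansion input written above: each line is a virtual cluster id and a node id separated by a tab (the exact values depend on the random train/test split)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with open(fb_cleora_input_star_filename) as f:\n",
"    for _ in range(5):\n",
"        print(f.readline().rstrip())"
]
},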
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 7.37 ms, sys: 4.02 ms, total: 11.4 ms\n",
"Wall time: 5.32 s\n"
]
}
],
"source": [
"%%time\n",
"cleora_output_star_filename = train_cleora(config['cleora_dim'], config['cleora_n_iter'], \"transient::cluster_id StarNode\", fb_cleora_input_star_filename, output_dir)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Clique expansion\n",
"\n",
"The `fb_cleora_input_clique.txt` file has the structure of adjacency list. The parameter `-c \"complex::reflexive::node\"` means that edges will be created for all cominations of nodes from each line. This translates to clique expansion scheme."
]
},
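{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick peek at the clique-expansion input written above: each line is a node followed by its space-separated neighbours (truncated here for readability)."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with open(fb_cleora_input_clique_filename) as f:\n",
"    for _ in range(3):\n",
"        print(f.readline().rstrip()[:120])"
]
},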
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 5.71 ms, sys: 4.71 ms, total: 10.4 ms\n",
"Wall time: 8.42 s\n"
]
}
],
"source": [
"%%time\n",
"cleora_output_clique_filename = train_cleora(config['cleora_dim'], config['cleora_n_iter'], \"complex::reflexive::CliqueNode\", fb_cleora_input_clique_filename, output_dir)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## No expansion\n",
"\n",
"You can also compute Cleora without any expansion scheme by providing an input file in the edgelist format (single pair of nodes per line). Run with a simple parameter: `-c \"node1 node2\"`."
]
},
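{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an optional illustration (not used by the rest of the notebook), the sketch below writes the raw training edges in edgelist format and runs Cleora without any expansion scheme; the file and variable names are only suggestions."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#plain edgelist input: one \"id_1 id_2\" pair per line, no expansion scheme\n",
"fb_cleora_input_edgelist_filename = \"fb_cleora_input_edgelist.txt\"\n",
"train.to_csv(fb_cleora_input_edgelist_filename, sep=\" \", header=False, index=False)\n",
"cleora_output_edgelist_filename = train_cleora(config['cleora_dim'], config['cleora_n_iter'],\n",
"                                               \"node1 node2\", fb_cleora_input_edgelist_filename, output_dir)"
]
},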
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Link Prediction\n",
"\n",
"In the link prediction task, we train a binary classifier to distinguish real edges from fake edges. Real edges are composed of pairs of nodes from train/test set, while fake edges are built in two ways depending on whether we're training or testing:\n",
"\n",
"1. In training: we draw random pairs of edges\n",
"2. In testing: we take a valid pair of nodes (nodeA-nodeB) from the testset. Then we pair nodeA to 10.000 most common nodes in the dataset. This way, we obtain 1 real and 10.000 fake examples.\n",
"\n",
"We compute a Hadamard product between pairs of embeddings. As a result, we obtain a single vector for each embedding pair, which represents an approximation of node similarity. We feed the products as inputs to the classifier.\n",
"\n",
"At test time, we compute ranking metrics of the correct prediction among the 10.001 given pairs of nodes. We compute the mean reciprocal rank measure (MRR) and hit rate in top 10 predictins (HR@10)."
]
},
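{
"cell_type": "markdown",
"metadata": {},
"source": [
"The helper below is a minimal, self-contained sketch of the two ranking metrics used at test time; the evaluation loop further down inlines the same logic. The function name and the toy scores are only illustrative."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def ranking_metrics(scores, true_idx, k=10):\n",
"    #rank of the true pair among all scored pairs (1 = best)\n",
"    ranking = (-scores).argsort()\n",
"    rank = np.where(ranking == true_idx)[0][0] + 1\n",
"    #reciprocal rank and top-k hit indicator\n",
"    return 1.0 / rank, float(rank <= k)\n",
"\n",
"#toy example: 4 fake pairs plus the real pair (last), which gets the second-highest score -> rank 2\n",
"ranking_metrics(np.array([0.10, 0.95, 0.30, 0.20, 0.90]), true_idx=4)"
]
},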
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"def read_embeddings(input_file):\n",
" df_full = pd.read_csv(input_file, delimiter = \" \", skiprows=[0], header=None, \n",
" index_col=0)\n",
" df_full = df_full.drop([1], axis=1) \n",
" return df_full"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [],
"source": [
"def read_train_test(embeddings):\n",
" valid_idx = embeddings.index.to_numpy()\n",
" train = np.loadtxt(fb_lp_train_filename, delimiter=\"\\t\", dtype=np.int)\n",
" test = np.loadtxt(fb_lp_test_filename, delimiter=\"\\t\", dtype=np.int)\n",
" \n",
" #valid pairs of nodes\n",
" train = train[np.isin(train[:,0], valid_idx) & np.isin(train[:,1], valid_idx)]\n",
" test = test[np.isin(test[:,0], valid_idx) & np.isin(test[:,1], valid_idx)]\n",
" \n",
" #negatives for testset: top 10000 most common nodes\n",
" all_idx = train.flatten()\n",
" ctr = Counter(all_idx)\n",
" negatives = ctr.most_common(10000)\n",
" negatives = [ n[0] for n in negatives ]\n",
" \n",
" adjacency_dict = dict()\n",
" for inp, out in np.vstack([test, train]):\n",
" if inp not in adjacency_dict:\n",
" adjacency_dict[inp] = set()\n",
" adjacency_dict[inp].add(out)\n",
"\n",
" return train, test, negatives, adjacency_dict, valid_idx"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"batch_size = config['batch_size']\n",
"test_batch_size = config['test_batch_size']"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 535/535 [00:11<00:00, 47.06it/s]\n",
"100%|██████████| 535/535 [00:09<00:00, 56.56it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"mrr 0.06616147762651434 hr@10 0.15\n",
"mrr 0.08095224568988774 hr@10 0.21\n",
"mrr 0.08122927128005851 hr@10 0.21\n",
"mrr 0.07496740499745976 hr@10 0.2175\n",
"mrr 0.07467647332139908 hr@10 0.216\n",
"mrr 0.07469468325977181 hr@10 0.20333333333333334\n",
"mrr 0.07155631268196187 hr@10 0.1957142857142857\n",
"mrr 0.07176231294260671 hr@10 0.19\n",
"mrr 0.07264605312117074 hr@10 0.19555555555555557\n",
"mrr 0.07792850344426648 hr@10 0.203\n",
"algo: output/emb__cluster_id__StarNode.out epochs: 2 lr: 0.0001, mrr: 0.07792850344426648, hr@10: 0.203\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 535/535 [00:10<00:00, 50.13it/s]\n",
"100%|██████████| 535/535 [00:10<00:00, 52.55it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"mrr 0.04620611596633895 hr@10 0.12\n",
"mrr 0.052375047344624 hr@10 0.125\n",
"mrr 0.04981016049216018 hr@10 0.13333333333333333\n",
"mrr 0.055700935501425936 hr@10 0.145\n",
"mrr 0.0654246642870071 hr@10 0.164\n",
"mrr 0.0630759077640286 hr@10 0.16666666666666666\n",
"mrr 0.061461758290300234 hr@10 0.1657142857142857\n",
"mrr 0.06278197507333488 hr@10 0.16625\n",
"mrr 0.06284700154608117 hr@10 0.1622222222222222\n",
"mrr 0.06326597231126035 hr@10 0.161\n",
"algo: output/emb__CliqueNode__CliqueNode.out epochs: 2 lr: 0.0001, mrr: 0.06326597231126035, hr@10: 0.161\n"
]
}
],
"source": [
"for algo in [cleora_output_star_filename, cleora_output_clique_filename]:\n",
" embeddings = read_embeddings(algo)\n",
" train_1, test_1, negatives, adjacency_dict, valid_idx = read_train_test(embeddings)\n",
" #for faster operation, draw only 1000 test examples\n",
" test_ex = random.sample(list(test_1), 1000)\n",
" \n",
" #these are the 10.000 most common nodes selected as negatives for each valid testing node pair\n",
" df_neg = embeddings.loc[negatives]\n",
" neg_ids = set(df_neg.index)\n",
"\n",
" epoch = max(config['epochs'])\n",
" for a in config['alpha']:\n",
" #create a binary classifier outputting whether a node pair represents a valid edge (1) or not a valid edge (0)\n",
" clf = SGDClassifier(random_state=0, loss='log', alpha=a)\n",
" for e in range(0, epoch):\n",
" np.random.shuffle(train_1)\n",
" \n",
" for idx in tqdm(range(0,train_1.shape[0],batch_size)):\n",
" #ones = real pairs of nodes\n",
" #zeros = fake pairs of nodes\n",
" ones=train_1[idx:min(idx+batch_size,train_1.shape[0]),:]\n",
" \n",
" ones_emb_in = embeddings.loc[ones[:,0]].to_numpy()\n",
" ones_emb_out = embeddings.loc[ones[:,1]].to_numpy()\n",
" #Hadamard\n",
" ones = np.multiply(ones_emb_in,ones_emb_out)\n",
" \n",
" id_train_0_in = np.random.choice(valid_idx, size=len(ones), replace=True)\n",
" id_train_0_out = np.random.choice(valid_idx, size=len(ones), replace=True)\n",
" \n",
" zeros_emb_in = embeddings.loc[id_train_0_in].to_numpy()\n",
" zeros_emb_out = embeddings.loc[id_train_0_out].to_numpy()\n",
" #Hadamard\n",
" zeros = np.multiply(zeros_emb_in, zeros_emb_out)\n",
" \n",
" x_train = np.vstack([ones, zeros])\n",
" y_train = [1]*len(ones) + [0]*len(ones)\n",
"\n",
" clf.partial_fit(x_train, y_train, classes=[0,1])\n",
"\n",
" if e+1 in config['epochs']:\n",
" mrr = 0.0\n",
" hr = 0.0\n",
" for n, ex in enumerate(test_ex):\n",
" l = ex[0]\n",
" r = ex[1]\n",
"\n",
" emb_l = embeddings.loc[l].to_numpy().reshape([1, -1])\n",
" emb_r = np.vstack((df_neg.to_numpy(), embeddings.loc[r].to_numpy()))\n",
" \n",
" full_ex = np.hstack([np.repeat(emb_l, len(emb_r), axis=0), emb_r])\n",
" hadamard = np.multiply(emb_l, emb_r)\n",
" preds = clf.predict_proba(hadamard)[:,1]\n",
" preds = np.array(preds)\n",
"\n",
" #do not punish for high scores of items from trainset and others from testset\n",
" forbidden_ex = adjacency_dict[l]\n",
" df_mask = [0 if (elem in forbidden_ex) else 1 for elem in neg_ids]\n",
" #last elem is always valid\n",
" df_mask.append(1)\n",
" preds *= df_mask\n",
" \n",
" ranking = (-preds).argsort()\n",
" rank = np.isin(ranking, 10000).nonzero()[0][0]+1\n",
" mrr += 1/rank\n",
" hr += (rank <= 10)\n",
" \n",
" if (n+1)%100 == 0:\n",
" print('mrr ', mrr/(n+1), ' hr@10 ', hr/(n+1))\n",
"\n",
" print('algo: {} epochs: {} lr: {}, mrr: {}, hr@10: {}'.format(algo, str(e+1), a, mrr/len(test_ex), hr/len(test_ex)))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}