{ "cells": [ { "cell_type": "code", "execution_count": 145, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.linear_model import SGDClassifier\n", "from sklearn.utils import shuffle\n", "from tqdm import tqdm\n", "import pickle as pkl\n", "import pandas as pd\n", "import random\n", "import sys\n", "import os\n", "from sklearn.metrics import f1_score" ] }, { "cell_type": "code", "execution_count": 146, "metadata": {}, "outputs": [], "source": [ "random.seed(0)\n", "np.random.seed(0)" ] }, { "cell_type": "code", "execution_count": 147, "metadata": {}, "outputs": [], "source": [ "config = {\n", " #embedding computation\n", " 'cleora_n_iter': 5,\n", " 'cleora_dim': 1024,\n", " \n", " #dataset preparation\n", " 'train_test_split': 0.2,\n", " \n", " #training classification\n", " 'input_embeddings': [\n", " 'output/emb__cluster_id__StarNode.out',\n", " 'output/emb__CliqueNode__CliqueNode.out',\n", " ],\n", " 'batch_size': 256,\n", " 'test_batch_size': 1000,\n", " 'epochs': [20],\n", "}" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Dataset preparation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "1. Download the Facebook dataset from SNAP: https://snap.stanford.edu/data/facebook-large-page-page-network.html\n", "2. Extract the dataset to ./facebook_large/\n", "3. Compute Cleora embeddings as shown in \"Cleora training\" section in `example_link_prediction.ipynb`" ] }, { "cell_type": "code", "execution_count": 148, "metadata": {}, "outputs": [], "source": [ "df_cleora = pd.read_csv(\"./facebook_large/musae_facebook_edges.csv\")" ] }, { "cell_type": "code", "execution_count": 149, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
id_1id_2
0018427
1121708
2122208
3122171
416829
\n", "
" ], "text/plain": [ " id_1 id_2\n", "0 0 18427\n", "1 1 21708\n", "2 1 22208\n", "3 1 22171\n", "4 1 6829" ] }, "execution_count": 149, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_cleora.head()" ] }, { "cell_type": "code", "execution_count": 150, "metadata": {}, "outputs": [], "source": [ "train_cleora, test_cleora = train_test_split(df_cleora, test_size=config['train_test_split'])" ] }, { "cell_type": "code", "execution_count": 151, "metadata": {}, "outputs": [], "source": [ "fb_cleora_input_clique_filename = \"fb_cleora_input_clique.txt\"\n", "fb_cleora_input_star_filename = \"fb_cleora_input_star.txt\"\n", "output_dir = 'output'" ] }, { "cell_type": "code", "execution_count": 152, "metadata": {}, "outputs": [], "source": [ "with open(fb_cleora_input_clique_filename, \"w\") as f_cleora_clique, open(fb_cleora_input_star_filename, \"w\") as f_cleora_star:\n", " grouped_train = train_cleora.groupby('id_1')\n", " for n, (name, group) in enumerate(grouped_train):\n", " group_list = group['id_2'].tolist()\n", " group_elems = list(map(str, group_list))\n", " f_cleora_clique.write(\"{} {}\\n\".format(name, ' '.join(group_elems)))\n", " f_cleora_star.write(\"{}\\t{}\\n\".format(n, name))\n", " for elem in group_elems:\n", " f_cleora_star.write(\"{}\\t{}\\n\".format(n, elem))" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 153, "metadata": {}, "outputs": [], "source": [ "df = pd.read_csv(\"facebook_large/musae_facebook_target.csv\")" ] }, { "cell_type": "code", "execution_count": 154, "metadata": {}, "outputs": [], "source": [ "classes = df['page_type'].unique()\n", "class_ids = list(range(0, len(classes)))\n", "class_dict = {k:v for k,v in zip(classes, class_ids)}\n", "df['page_type'] = [class_dict[item] for item in df['page_type']] " ] }, { "cell_type": "code", "execution_count": 155, "metadata": {}, "outputs": [], "source": [ "train_filename = \"fb_classification_train.txt\"\n", "test_filename = \"fb_classification_test.txt\"" ] }, { "cell_type": "code", "execution_count": 156, "metadata": {}, "outputs": [], "source": [ "train, test = train_test_split(df, test_size=config['train_test_split'])" ] }, { "cell_type": "code", "execution_count": 157, "metadata": {}, "outputs": [], "source": [ "with open(train_filename, \"w\") as f_train:\n", " for index, row in train.iterrows():\n", " f_train.write(\"{} {}\\n\".format(row['id'], row['page_type']))" ] }, { "cell_type": "code", "execution_count": 158, "metadata": {}, "outputs": [], "source": [ "with open(test_filename, \"w\") as f_test:\n", " for index, row in test.iterrows():\n", " f_test.write(\"{} {}\\n\".format(row['id'], row['page_type']))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Cleora training" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Download an appropriate binary Cleora release from: https://github.com/Synerise/cleora/releases . \n", "\n", "A Linux GNU version is assumed in this example, but any other will do." 
] }, { "cell_type": "code", "execution_count": 159, "metadata": {}, "outputs": [], "source": [ "import subprocess\n", "\n", "\n", "def columns2output_filename(output_dir, columns):\n", " columns_split = columns.split()\n", " if len(columns_split) == 1 and 'reflexive' in columns:\n", " column_name = columns.split('::')[-1]\n", " return os.path.join(output_dir, f'emb__{column_name}__{column_name}.out')\n", "\n", " column_names = [i.split('::')[-1] for i in columns_split]\n", " return os.path.join(output_dir, 'emb__' + '__'.join(column_names) + '.out')\n", "\n", "\n", "def train_cleora(dim, n_iter, columns, input_filename, output_dir):\n", " command = ['./cleora-v1.0.1-x86_64-unknown-linux-gnu',\n", " '--columns', columns,\n", " '--dimension', str(dim), \n", " '-n', str(n_iter), \n", " '--input', input_filename, \n", " '-o', output_dir]\n", " subprocess.run(command, check=True)\n", " return columns2output_filename(output_dir, columns)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Star expansion\n", "\n", "In the `fb_cleora_input_star.txt` file the first column is a virtual node. The parameter `-c \"transient::cluster_id node\"` means that embeddings will not be created for nodes from this column. This translates to star expansion scheme." ] }, { "cell_type": "code", "execution_count": 160, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 1.37 ms, sys: 8.1 ms, total: 9.47 ms\n", "Wall time: 8.59 s\n" ] } ], "source": [ "%%time\n", "cleora_output_star_filename = train_cleora(config['cleora_dim'], config['cleora_n_iter'], \"transient::cluster_id StarNode\", fb_cleora_input_star_filename, output_dir)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Clique expansion\n", "\n", "The `fb_cleora_input_clique.txt` file has the structure of adjacency list. The parameter `-c \"complex::reflexive::node\"` means that edges will be created for all cominations of nodes from each line. This translates to clique expansion scheme." ] }, { "cell_type": "code", "execution_count": 161, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "CPU times: user 4.42 ms, sys: 8.34 ms, total: 12.8 ms\n", "Wall time: 13.7 s\n" ] } ], "source": [ "%%time\n", "cleora_output_clique_filename = train_cleora(config['cleora_dim'], config['cleora_n_iter'], \"complex::reflexive::CliqueNode\", fb_cleora_input_clique_filename, output_dir)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## No expansion\n", "\n", "You can also compute Cleora without any expansion scheme by providing an input file in the edgelist format (single pair of nodes per line). Run with a simple parameter: `-c \"node1 node2\"`." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Classification\n", "\n", "We train a simple multiclass Logistic Regression classifier to predict the class of node based on its embedding. We assess the quality of the classifier with of 2 metrics: micro-F1 and macro-F1." 
] }, { "cell_type": "code", "execution_count": 162, "metadata": {}, "outputs": [], "source": [ "def read_embeddings(input_file):\n", " df_full = pd.read_csv(input_file, delimiter = \" \", skiprows=[0], header=None, \n", " index_col=0)\n", " df_full = df_full.drop([1], axis=1)\n", "\n", " return df_full" ] }, { "cell_type": "code", "execution_count": 163, "metadata": {}, "outputs": [], "source": [ "def read_train_test(embeddings):\n", " valid_idx = embeddings.index.to_numpy()\n", " \n", " train = np.loadtxt(train_filename, delimiter=\" \", dtype=np.int) \n", " test = np.loadtxt(test_filename, delimiter=\" \", dtype=np.int)\n", " \n", " train = train[np.isin(train[:,0], valid_idx) & np.isin(train[:,1], valid_idx)]\n", " test = [t for t in test if (t[0] in valid_idx) and (t[1] in valid_idx)] \n", " \n", " train = np.array(train)\n", " test = np.array(test)\n", " \n", " return train,test" ] }, { "cell_type": "code", "execution_count": 164, "metadata": {}, "outputs": [], "source": [ "batch_size = config['batch_size']\n", "test_batch_size = config['test_batch_size']" ] }, { "cell_type": "code", "execution_count": 165, "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 20/20 [00:15<00:00, 1.29it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "algo: output/emb__cluster_id__StarNode.out epochs: 20, micro f1: 0.9093110871905274, macro f1:0.9094875754311472\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "100%|██████████| 20/20 [00:15<00:00, 1.33it/s]" ] }, { "name": "stdout", "output_type": "stream", "text": [ "algo: output/emb__CliqueNode__CliqueNode.out epochs: 20, micro f1: 0.9171151776103337, macro f1:0.9169262311726959\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "\n" ] } ], "source": [ "for algo in config['input_embeddings']:\n", " embeddings = read_embeddings(algo)\n", " train,test = read_train_test(embeddings)\n", " \n", " y_train = train[:, 1]\n", " y_test = test[:, 1]\n", "\n", " clf = SGDClassifier(random_state=0, loss='log', alpha=0.0001)\n", " for e in tqdm(range(0, max(config['epochs']))):\n", " for idx in range(0,train.shape[0],batch_size):\n", " ex=train[idx:min(idx+batch_size,train.shape[0]),:]\n", "\n", " ex_emb_in = embeddings.loc[ex[:,0]].to_numpy()\n", " ex_y = y_train[idx:min(idx+batch_size,train.shape[0])]\n", " \n", " clf.partial_fit(ex_emb_in, ex_y, classes=[0,1,2,3])\n", " \n", " if e+1 in config['epochs']:\n", " acc = 0.0\n", " y_pred = []\n", " for n, idx in enumerate(range(0,test.shape[0],test_batch_size)):\n", " ex=test[idx:min(idx+test_batch_size,train.shape[0]),:]\n", " ex_emb_in = embeddings.loc[ex[:,0]].to_numpy()\n", " pred = clf.predict_proba(ex_emb_in)\n", " \n", " classes = np.argmax(pred, axis=1)\n", " y_pred.extend(classes)\n", "\n", " f1_micro = f1_score(y_test, y_pred, average='micro')\n", " f1_macro = f1_score(y_test, y_pred, average='macro')\n", " print('algo: {} epochs: {}, micro f1: {}, macro f1:{}'.format(algo, e+1, f1_micro, f1_macro))\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": 
"ipython3", "version": "3.7.4" } }, "nbformat": 4, "nbformat_minor": 4 }