{ "cells": [ { "cell_type": "code", "execution_count": 145, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.linear_model import SGDClassifier\n", "from sklearn.utils import shuffle\n", "from tqdm import tqdm\n", "import pickle as pkl\n", "import pandas as pd\n", "import random\n", "import sys\n", "import os\n", "from sklearn.metrics import f1_score" ] }, { "cell_type": "code", "execution_count": 146, "metadata": {}, "outputs": [], "source": [ "# Seed NumPy's and the stdlib's global RNGs with the same fixed value.\n", "np.random.seed(0)\n", "random.seed(0)" ] }, { "cell_type": "code", "execution_count": 147, "metadata": {}, "outputs": [], "source": [ "# Notebook-wide settings, grouped by pipeline stage.\n", "config = dict(\n", "    # embedding computation\n", "    cleora_n_iter=5,\n", "    cleora_dim=1024,\n", "\n", "    # dataset preparation\n", "    train_test_split=0.2,\n", "\n", "    # training classification\n", "    input_embeddings=[\n", "        'output/emb__cluster_id__StarNode.out',\n", "        'output/emb__CliqueNode__CliqueNode.out',\n", "    ],\n", "    batch_size=256,\n", "    test_batch_size=1000,\n", "    epochs=[20],\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Dataset preparation" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "1. Download the Facebook dataset from SNAP: https://snap.stanford.edu/data/facebook-large-page-page-network.html\n", "2. Extract the dataset to ./facebook_large/\n", "3. Compute Cleora embeddings as shown in \"Cleora training\" section in `example_link_prediction.ipynb`" ] }, { "cell_type": "code", "execution_count": 148, "metadata": {}, "outputs": [], "source": [ "# Read the Facebook page-page edge list into a DataFrame.\n", "df_cleora = pd.read_csv(\n", "    filepath_or_buffer=\"./facebook_large/musae_facebook_edges.csv\",\n", ")" ] }, { "cell_type": "code", "execution_count": 149, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | id_1 | \n", "id_2 | \n", "
---|---|---|
0 | \n", "0 | \n", "18427 | \n", "
1 | \n", "1 | \n", "21708 | \n", "
2 | \n", "1 | \n", "22208 | \n", "
3 | \n", "1 | \n", "22171 | \n", "
4 | \n", "1 | \n", "6829 | \n", "