In [None]:
# Environment diagnostic: ask TensorFlow which local compute devices it detects.
# The bare last expression makes the notebook display the returned device list.
from tensorflow.python.client import device_lib
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 11186713555381752508]

In [None]:
# Environment diagnostic: print the kernel's memory statistics (MemTotal, MemFree, ...).
!cat /proc/meminfo

MemTotal: 13302928 kB
MemFree: 10445988 kB
MemAvailable: 12396088 kB
Buffers: 125744 kB
Cached: 1951416 kB
SwapCached: 0 kB
Active: 1102596 kB
Inactive: 1504304 kB
Active(anon): 487052 kB
Inactive(anon): 424 kB
Active(file): 615544 kB
Inactive(file): 1503880 kB
Unevictable: 0 kB
Mlocked: 0 kB
SwapTotal: 0 kB
SwapFree: 0 kB
Dirty: 16096 kB
Writeback: 0 kB
AnonPages: 529792 kB
Mapped: 376976 kB
Shmem: 1136 kB
KReclaimable: 144304 kB
Slab: 189052 kB
SReclaimable: 144304 kB
SUnreclaim: 44748 kB
KernelStack: 4800 kB
PageTables: 6728 kB
NFS_Unstable: 0 kB
Bounce: 0 kB
WritebackTmp: 0 kB
CommitLimit: 6651464 kB
Committed_AS: 3166456 kB
VmallocTotal: 34359738367 kB
VmallocUsed: 7172 kB
VmallocChunk: 0 kB
Percpu: 1400 kB
AnonHugePages: 0 kB
ShmemHugePages: 0 kB
ShmemPmdMapped: 0 kB
FileHugePages: 0 kB
FilePmdMapped: 0 kB
CmaTotal: 0 kB
CmaFree: 0 kB
HugePages_Total: 0
HugePages_Free: 0
HugePages_Rsvd: 0
HugePages_Surp: 0
Hugepagesize: 2048 kB
Hugetlb: 0 kB
DirectMap4k: 95040 kB
DirectMap2M: 514

In [None]:
import pandas as pd
import sklearn.metrics
import numpy as np
import random

In [None]:
from pathlib import Path

# Base directory for everything this notebook downloads and writes (dataset
# archive, Cleora binary, Cleora input file). It must end with "/" because the
# paths below -- and in later cells -- are built by plain string concatenation
# (PATH + "name"). Path.home() carries no trailing separator, so one is added.
PATH = str(Path.home()).rstrip("/") + "/"

# Directory that holds the unpacked Dunnhumby "The Complete Journey" dataset
DUNNHUMBY_JOURNEY_PATH = PATH + "dunnhumby_The-Complete-Journey/"

# Directory handed to Cleora as its output location
OUTPUT_PATH = PATH + 'output/'

In [None]:
# Seed both generators the notebook imports (Python's `random` and NumPy's
# global generator) with the same value so repeated runs draw the same numbers.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

**Dataset preparation**

1. Download the Dunnhumby *The Complete Journey* dataset from the official repository.
2. Unpack the .zip file to the previously defined location



In [None]:
# Download the dataset archive; wget's -P option sets the directory the file is saved in.
# NOTE(review): IPython should expand $PATH from the notebook's Python variable PATH
# (set in the configuration cell), not from the shell environment variable of the same name - confirm.
!wget https://www.dunnhumby.com/wp-content/uploads/sourcefiles/dunnhumby_The-Complete-Journey.zip -P $PATH

--2021-08-04 17:23:06-- https://www.dunnhumby.com/wp-content/uploads/sourcefiles/dunnhumby_The-Complete-Journey.zip
Resolving www.dunnhumby.com (www.dunnhumby.com)... 13.107.246.67, 13.107.213.67, 2620:1ec:bdf::67, ...
Connecting to www.dunnhumby.com (www.dunnhumby.com)|13.107.246.67|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 307038175 (293M) [application/zip]
Saving to: ‘/content/drive/MyDrive/cleora_training/dunnhumby/dunnhumby_The-Complete-Journey.zip’


2021-08-04 17:23:22 (19.7 MB/s) - ‘/content/drive/MyDrive/cleora_training/dunnhumby/dunnhumby_The-Complete-Journey.zip’ saved [307038175/307038175]



In [None]:
# Unpack the downloaded archive into PATH: -d sets the extraction directory,
# -o overwrites existing files without prompting, -q suppresses unzip's output.
!unzip -d $PATH -o -q $PATH/dunnhumby_The-Complete-Journey.zip

**Installation of the Synerise Cleora platform**

1. Download a released Cleora binary from the official Synerise repository. This notebook uses version 1.1.1; earlier and newer versions are available [here](https://github.com/Synerise/cleora/releases/).
2. Set the execute permissions of the previously downloaded Cleora binary file.



In [None]:
# Download the Cleora v1.1.1 Linux x86_64 release binary into PATH (-P sets the target directory).
!wget https://github.com/Synerise/cleora/releases/download/v1.1.1/cleora-v1.1.1-x86_64-unknown-linux-gnu -P $PATH 

--2021-08-04 17:23:53-- https://github.com/Synerise/cleora/releases/download/v1.1.1/cleora-v1.1.1-x86_64-unknown-linux-gnu
Resolving github.com (github.com)... 140.82.121.3
Connecting to github.com (github.com)|140.82.121.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github-releases.githubusercontent.com/310368642/0a02ca00-b4b4-11eb-8258-5b862e921e51?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20210804%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20210804T172353Z&X-Amz-Expires=300&X-Amz-Signature=5db708262725b021625bce84adcd37a7b0cdcca249bcc15a4c7b9442a40a2eb8&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=310368642&response-content-disposition=attachment%3B%20filename%3Dcleora-v1.1.1-x86_64-unknown-linux-gnu&response-content-type=application%2Foctet-stream [following]
--2021-08-04 17:23:53-- https://github-releases.githubusercontent.com/310368642/0a02ca00-b4b4-11eb-8258-5b862e921e51?X-Amz-Algorithm=AWS4-HMAC-S

In [None]:
# Add the execute permission to the downloaded Cleora binary so it can be run later.
!chmod +x $PATH/cleora-v1.1.1-x86_64-unknown-linux-gnu

In [None]:
# Transactions table; later cells group it by BASKET_ID and read PRODUCT_ID from it.
df = pd.read_csv(
    DUNNHUMBY_JOURNEY_PATH+"dunnhumby - The Complete Journey CSV/transaction_data.csv",
    delimiter=",",
)

# Product catalogue, indexed by its first column so rows can be looked up with products.loc[...].
products = pd.read_csv(
    DUNNHUMBY_JOURNEY_PATH+"dunnhumby - The Complete Journey CSV/product.csv",
    delimiter=",",
    index_col=0,
)

In [None]:
# Group the transaction rows by basket: each group holds the rows sharing one BASKET_ID.
df_grouped = df.groupby('BASKET_ID')

**Clique expansion**

1. The **dunnhumby_cleora_input.txt** file has the structure of an adjacency list: each line contains the product IDs of one basket, separated by spaces.

In [None]:
# Write the Cleora input file: one line per basket, holding that basket's
# product IDs separated by single spaces.
# The `with` block guarantees the file is flushed and closed even if the loop
# raises part-way through (the previous manual close() was skipped on error).
with open(PATH+'dunnhumby_cleora_input.txt', 'w') as cleora_f_dict:
    for _, group in df_grouped:
        product_ids = [str(p) for p in group['PRODUCT_ID'].tolist()]
        cleora_f_dict.write(' '.join(product_ids) + '\n')

**Setting the configuration before training Cleora**

1. cleora_n_iter 
2. cleora_dim
3. columns

and others if you want. You can find more about the configuration parameters [here](https://cleora.readthedocs.io/en/latest/running.html#run-options).


In [None]:
# Settings for the embedding computation; the training cell forwards them to train_cleora():
#   cleora_n_iter -> passed to the binary as `-n`
#   cleora_dim    -> passed to the binary as `--dimension`
#   columns       -> passed to the binary as `--columns`
config = dict(
    cleora_n_iter=4,
    cleora_dim=1024,
    columns='complex::reflexive::CliqueNode',
)

The parameter -c "**complex::reflexive::CliqueNode**" means that edges will be created for all combinations of nodes from each line. 
This translates to clique expansion scheme.

* You can read more about **Clique Node** and other expansion methods [here](https://cleora.readthedocs.io/en/latest/algorithms.html#clique-expansion).



In [None]:
import subprocess

def train_cleora(dim, n_iter, columns, input_filename):
    """Run the Cleora v1.1.1 binary stored under ``PATH`` on ``input_filename``.

    Args:
        dim: value passed to the binary as ``--dimension`` (converted with ``str``).
        n_iter: value passed as ``-n`` (converted with ``str``); also embedded in
            the ``-r`` argument as ``iterations_{n_iter}_``.
        columns: value passed as ``--columns``.
        input_filename: value passed as ``--input``.

    The global ``OUTPUT_PATH`` is passed as ``-o``.

    Raises:
        subprocess.CalledProcessError: if the binary exits with a non-zero
            status (the call uses ``check=True``).
    """
    executable = PATH+'cleora-v1.1.1-x86_64-unknown-linux-gnu'
    # (flag, value) pairs in the order they are placed on the command line
    flag_values = (
        ('--columns', columns),
        ('--dimension', str(dim)),
        ('-n', str(n_iter)),
        ('--input', input_filename),
        ('-o', OUTPUT_PATH),
        ('-r', f'iterations_{n_iter}_'),
    )
    command = [executable]
    for flag, value in flag_values:
        command.extend((flag, value))
    subprocess.run(command, check=True)

In [None]:
%%time
# Run Cleora with the settings from `config` on the basket file written under PATH;
# %%time (which must stay the first line of the cell) reports how long the run takes.
train_cleora(config['cleora_dim'], config['cleora_n_iter'], config['columns'], PATH+'dunnhumby_cleora_input.txt')

CPU times: user 389 ms, sys: 52.7 ms, total: 442 ms
Wall time: 1min 44s


'/content/drive/MyDrive/cleora_training/dunnhumby/output/emb__CliqueNode__CliqueNode__iter_1.vec'

**Prediction**

In [None]:
# One list of stringified product IDs per basket, in the order the groupby yields the baskets.
prods_for_ar = [
    [str(pid) for pid in basket['PRODUCT_ID'].tolist()]
    for _, basket in df_grouped
]

In [None]:
# Load the Cleora text output as one float array: column 0 (used as the product ID
# by the following cells) plus the embedding columns. skiprows=1 drops the file's
# first line and file column 1 is skipped - presumably a header line and a
# per-entity count; confirm against the Cleora output format.
# The number of embedding columns is taken from config['cleora_dim'] instead of a
# hard-coded 1026, so changing the dimension in `config` cannot silently truncate
# or overrun the vectors.
# NOTE(review): train_cleora() passes `-r iterations_{n_iter}_` to the binary; verify
# that the file it writes is really named "__CliqueNode__CliqueNode.out".
embedding_cols = [0] + list(range(2, 2 + config['cleora_dim']))
vects_iter = np.loadtxt(OUTPUT_PATH+"__CliqueNode__CliqueNode.out", delimiter = " ", skiprows=1, usecols=embedding_cols)

In [None]:
# Split the loaded array: first column -> IDs, remaining columns -> embedding vectors.
# Both right-hand sides are evaluated before either name is rebound.
# Caution: vects_iter is overwritten, so re-running this cell without reloading
# the array would strip one more column.
ids_iter, vects_iter = vects_iter[:, 0], vects_iter[:, 1:]

In [None]:
# Cosine similarity of the first embedding (as a 1 x dim row) against every embedding,
# then the column indices ordered from most to least similar.
first_vector = vects_iter[0].reshape(1, -1)
dist = sklearn.metrics.pairwise.cosine_similarity(first_vector, vects_iter, dense_output=True)
ranking = np.argsort(-dist)

In [None]:
# For every embedded product, print its catalogue description followed by the
# ten most cosine-similar other products.
for idx, v in enumerate(vects_iter):
    print('-------------base product:')
    # IDs come out of np.loadtxt as floats; cast to int so the base product is
    # printed and looked up the same way as the suggested products below.
    curr_prod_id = int(ids_iter[idx])
    print(curr_prod_id)
    curr_product = products.loc[curr_prod_id]

    print(curr_product['COMMODITY_DESC'], curr_product['SUB_COMMODITY_DESC'], curr_product['CURR_SIZE_OF_PRODUCT'])
    dist = sklearn.metrics.pairwise.cosine_similarity(v.reshape(1, -1), vects_iter, dense_output=True)
    # Negate so argsort orders indices from most to least similar; [0] takes the single result row.
    ranking = (-dist).argsort()[0]
    # A product is always maximally similar to itself; drop its own index so it
    # does not take up one of the ten suggestion slots.
    ranking = ranking[ranking != idx]
    for r in ranking[:10]:
        suggested_prod_id = int(ids_iter[r])
        suggested_prod = products.loc[suggested_prod_id]
        print('suggested: ', suggested_prod_id, suggested_prod['COMMODITY_DESC'], suggested_prod['SUB_COMMODITY_DESC'], suggested_prod['CURR_SIZE_OF_PRODUCT'])


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
suggested: 16769752 PROCESSED DIPS 16 OZ
suggested: 915257 BEEF LOIN - STK/CHP/SLC 
suggested: 863447 BEEF CHOICE BEEF 
suggested: 12263300 BLEACH ALL FABRIC DRY BLEACH 1.5 LB
suggested: 955765 PROCESSED PROCESSED OTHER 4.5 OZ
-------------base product:
5564922.0
CRACKERS/MISC BKD FD CHEESE CRACKERS (CHEEZ-ITS/GOL 7.2 OZ
suggested: 5564922 CRACKERS/MISC BKD FD CHEESE CRACKERS (CHEEZ-ITS/GOL 7.2 OZ
suggested: 832312 IN-STORE PHOTOFINISHING OVERNIGHT PROCESSING 
suggested: 10285167 AUDIO/VIDEO PRODUCTS DVD S 
suggested: 7167711 ICE CREAM/MILK/SHERBTS PREMIUM 1/2 GAL
suggested: 1096449 LIQUOR LIQUEURS/SPECIALTIES (42 UNDER 750 ML
suggested: 15778515 PET CARE SUPPLIES DOG & CAT CHEMICALS 
suggested: 9297058 NO COMMODITY DESCRIPTION NO SUBCOMMODITY DESCRIPTION 
suggested: 7168701 STATIONERY & SCHOOL SUPPLIES CHILDREN S ACTIVITY 6 CT
suggested: 9445751 DIETARY AID PRODUCTS LIQUID NUTRITION FOR ADULTS 6PK/8OZ
suggested: 5568152 