# Document embeddings

In [1]:
import urllib.request
import os

def download_file(url,local_file, force=False):
    """
    Helper function to download a file and store it locally
    """
    if not os.path.exists(local_file) or force:
        print('Downloading')
        with urllib.request.urlopen(url) as opener, \
             open(local_file, mode='wb') as outfile:
                    outfile.write(opener.read())
        print('done')
    else:
        print('Already downloaded')

The data we are working on are reviews of restaurants in Pisa.
The file is in CSV format, with the name of the restaurant in the third column and the review in the fourth column.

In [2]:
import re

review_file = 'restaurants_pisa.csv'
download_file('https://goo.gl/1SYVBt', review_file)

with open(review_file, mode='r', encoding='utf-8') as infile:
    print(next(infile)) # printing the first line

Already downloaded
118184055,pisa_res\it\1184195\2046160\118184055.html,Ristorante Il Granaio,"Il ristorante  veramente tipico e in zona uno dei migliori in assoluto per mangiare la carne. Abbiamo fatto un addio al celibato e mangiato davvero come lupi senza lesinare su niente...come recitato sul loro sito ogni pietanza  di produzione locale, soprattutto il &quot;mucco pisano&quot; questa razza bovina recuperata dall'universit. Posso dire di non aver mai mangiato cos tanto e bene in un ristorante, ma neanche di aver speso cos tanto! In totale 62, per quanto tutto d'eccellenza, sono stati davvero troppi!",3,negative



We load the file using the functions from the csv module, and put it in the format specified in gensim documentation at https://radimrehurek.com/gensim/models/doc2vec.html

In [3]:
import csv
import re
from gensim.models.doc2vec import TaggedDocument
data = list()
with open(review_file, encoding='utf-8', newline='') as infile:
	reader = csv.reader(infile)
	for row in reader:
		data.append(TaggedDocument(re.split('[\W\d_]+',row[3].lower()),[row[2]]))
data

[TaggedDocument(words=['il', 'ristorante', 'veramente', 'tipico', 'e', 'in', 'zona', 'uno', 'dei', 'migliori', 'in', 'assoluto', 'per', 'mangiare', 'la', 'carne', 'abbiamo', 'fatto', 'un', 'addio', 'al', 'celibato', 'e', 'mangiato', 'davvero', 'come', 'lupi', 'senza', 'lesinare', 'su', 'niente', 'come', 'recitato', 'sul', 'loro', 'sito', 'ogni', 'pietanza', 'di', 'produzione', 'locale', 'soprattutto', 'il', 'quot', 'mucco', 'pisano', 'quot', 'questa', 'razza', 'bovina', 'recuperata', 'dall', 'universit', 'posso', 'dire', 'di', 'non', 'aver', 'mai', 'mangiato', 'cos', 'tanto', 'e', 'bene', 'in', 'un', 'ristorante', 'ma', 'neanche', 'di', 'aver', 'speso', 'cos', 'tanto', 'in', 'totale', 'per', 'quanto', 'tutto', 'd', 'eccellenza', 'sono', 'stati', 'davvero', 'troppi', ''], tags=['Ristorante Il Granaio']),
 TaggedDocument(words=['osteria', 'tipicissima', 'a', 'coltano', 'nella', 'campagna', 'tra', 'pisa', 'e', 'livorno', 'un', 'pochino', 'difficile', 'da', 'raggiungere', 'se', 'non', 'sai

In [4]:
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

Bulding the Doc2Vec model is very similar to building a Word2Vec one.

In [5]:
from gensim.models import Doc2Vec
model = Doc2Vec(data, size=100, window=10, min_count=5)

2019-06-07 12:03:32,288 : INFO : collecting all words and their counts
2019-06-07 12:03:32,288 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2019-06-07 12:03:32,471 : INFO : PROGRESS: at example #10000, processed 686075 words (3771972/s), 23170 word types, 106 tags
2019-06-07 12:03:32,649 : INFO : PROGRESS: at example #20000, processed 1367665 words (3844927/s), 32429 word types, 148 tags
2019-06-07 12:03:32,825 : INFO : PROGRESS: at example #30000, processed 2045428 words (3859293/s), 39709 word types, 216 tags
2019-06-07 12:03:32,998 : INFO : PROGRESS: at example #40000, processed 2691283 words (3743334/s), 45569 word types, 309 tags
2019-06-07 12:03:33,172 : INFO : PROGRESS: at example #50000, processed 3345779 words (3767755/s), 50534 word types, 480 tags
2019-06-07 12:03:33,350 : INFO : PROGRESS: at example #60000, processed 4010160 words (3764499/s), 55427 word types, 612 tags
2019-06-07 12:03:33,528 : INFO : PROGRESS: at example #70000, processe

The training produces word emeddings, which can be queried in the same way of the word2vec model

In [6]:
model.wv.most_similar(['pizza'])

2019-06-07 12:05:07,013 : INFO : precomputing L2-norms of word weight vectors


[('piadina', 0.6829147934913635),
 ('cecina', 0.655540943145752),
 ('pasta', 0.628814697265625),
 ('tagliata', 0.585970401763916),
 ('focaccina', 0.5721102356910706),
 ('fiorentina', 0.5518332719802856),
 ('focaccia', 0.5483949184417725),
 ('pizze', 0.5461304187774658),
 ('paella', 0.5445587635040283),
 ('farinata', 0.5411989688873291)]

In [7]:
model.wv.most_similar(['conto'])

[('costo', 0.750023365020752),
 ('prezzo', 0.7097448110580444),
 ('spesa', 0.5757821798324585),
 ('risultato', 0.5519887208938599),
 ('ricarico', 0.5369582772254944),
 ('digestivo', 0.4652857780456543),
 ('pagare', 0.4639531970024109),
 ('limoncello', 0.4600844383239746),
 ('liquorino', 0.4571288824081421),
 ('ponche', 0.45086413621902466)]

In [8]:
model.wv.most_similar(['cameriere'])

[('proprietario', 0.8149348497390747),
 ('signore', 0.7819277048110962),
 ('titolare', 0.7665988206863403),
 ('gestore', 0.7607924938201904),
 ('cameriera', 0.7500070333480835),
 ('personale', 0.7032696008682251),
 ('signora', 0.6522108316421509),
 ('propietario', 0.6041712164878845),
 ('ragazzo', 0.5915075540542603),
 ('cuoco', 0.5801193714141846)]

Document embeddings are stored in model.docvecs

In [9]:
model.docvecs.doctags

{'Ristorante Il Granaio': Doctag(offset=0, word_count=42319, doc_count=573),
 'La Taverna del Grillo': Doctag(offset=1, word_count=9561, doc_count=152),
 'Ristoro il giardino nascosto': Doctag(offset=2, word_count=17516, doc_count=224),
 'Ristoro Lago Le Tamerici': Doctag(offset=3, word_count=3800, doc_count=57),
 'Ristoro Re di Puglia': Doctag(offset=4, word_count=32768, doc_count=395),
 'Circolo Arci': Doctag(offset=5, word_count=490, doc_count=8),
 'Gelateria Tuffo 13': Doctag(offset=6, word_count=1226, doc_count=26),
 'Lo Spaventapasseri': Doctag(offset=7, word_count=640, doc_count=13),
 'Pisa Da Pizzi &amp; Co': Doctag(offset=8, word_count=163, doc_count=3),
 'Azzurro': Doctag(offset=9, word_count=2544, doc_count=42),
 'I Giardini del Colombre': Doctag(offset=10, word_count=1938, doc_count=32),
 'Pani e Tulipani': Doctag(offset=11, word_count=145, doc_count=2),
 'Caff Siena': Doctag(offset=12, word_count=187, doc_count=4),
 'Galileo Art Caf': Doctag(offset=13, word_count=2178, doc

The most_similar method in model.docvecs computes similarity among document embeddings

In [10]:
model.docvecs.most_similar("Il Campano")

2019-06-07 12:05:52,330 : INFO : precomputing L2-norms of doc weight vectors


[('Ristorante Il Granaio', 0.8523026704788208),
 ('Osteria in domo', 0.8299446702003479),
 ('La Taverna di Emma', 0.7787537574768066),
 ('Ristorante Enoteca IL TOSCANO', 0.75307297706604),
 ('Trattoria S. Omobono', 0.7454063296318054),
 ('Osteria di Culegna', 0.7407934665679932),
 ('La Grotta', 0.7357390522956848),
 ('Ristoro Re di Puglia', 0.7348884344100952),
 ('Ristorante Il Colonnino', 0.7283028960227966),
 ('Osteria dei Mille', 0.7281427383422852)]

Document embeddings can be also used to search similar embeddings among word embeddings

In [12]:
model.wv.most_similar([model.docvecs["Ristoro Re di Puglia"]])

[('bisonte', 0.5957494378089905),
 ('salamella', 0.5739859938621521),
 ('oberdan', 0.5711488723754883),
 ('trifolati', 0.5537594556808472),
 ('provole', 0.5449040532112122),
 ('struzzo', 0.5408831834793091),
 ('presentando', 0.540260374546051),
 ('rimessaggio', 0.5396377444267273),
 ('pacinotti', 0.5306090116500854),
 ('ospedaletto', 0.5286394357681274)]