.. _sec_synonyms:

Word Similarity and Analogy
===========================

In :numref:`sec_word2vec_pretraining`, we trained a word2vec model on a
small dataset and applied it to find semantically similar words for an
input word. In practice, word vectors that are pretrained on large
corpora can be applied to downstream natural language processing tasks,
which will be covered later in :numref:`chap_nlp_app`. To demonstrate
the semantics of word vectors pretrained on large corpora in a
straightforward way, let us apply them to the word similarity and word
analogy tasks.

We first import the packages needed in this section. The MXNet version:

.. code:: python

    import os
    from mxnet import np, npx
    from d2l import mxnet as d2l
    
    npx.set_np()

The PyTorch version:

.. code:: python

    import os
    import torch
    from torch import nn
    from d2l import torch as d2l

Below we list pretrained GloVe embeddings of dimension 50, 100, and 300,
which can be downloaded from the `GloVe website
<https://nlp.stanford.edu/projects/glove/>`__, as well as a pretrained
fastText embedding from the `fastText website <https://fasttext.cc/>`__.

.. code:: python

    #@save
    d2l.DATA_HUB['glove.6b.50d'] = (d2l.DATA_URL + 'glove.6B.50d.zip',
                                    '0b8703943ccdb6eb788e6f091b8946e82231bc4d')
    
    #@save
    d2l.DATA_HUB['glove.6b.100d'] = (d2l.DATA_URL + 'glove.6B.100d.zip',
                                     'cd43bfb07e44e6f27cbcc7bc9ae3d80284fdaf5a')
    
    #@save
    d2l.DATA_HUB['glove.42b.300d'] = (d2l.DATA_URL + 'glove.42B.300d.zip',
                                      'b5116e234e9eb9076672cfeabf5469f3eec904fa')
    
    #@save
    d2l.DATA_HUB['wiki.en'] = (d2l.DATA_URL + 'wiki.en.zip',
                               'c1816da3821ae9f43899be655002f6c723e91b88')

To load these pretrained GloVe and fastText embeddings, we define the
following ``TokenEmbedding`` class. The MXNet implementation:

.. code:: python

    #@save
    class TokenEmbedding:
        """Token Embedding."""
        def __init__(self, embedding_name):
            self.idx_to_token, self.idx_to_vec = self._load_embedding(
                embedding_name)
            self.unknown_idx = 0
            self.token_to_idx = {token: idx for idx, token in
                                 enumerate(self.idx_to_token)}
    
        def _load_embedding(self, embedding_name):
            idx_to_token, idx_to_vec = [''], []
            data_dir = d2l.download_extract(embedding_name)
            # GloVe website: https://nlp.stanford.edu/projects/glove/
            # fastText website: https://fasttext.cc/
            with open(os.path.join(data_dir, 'vec.txt'), 'r') as f:
                for line in f:
                    elems = line.rstrip().split(' ')
                    token, elems = elems[0], [float(elem) for elem in elems[1:]]
                    # Skip header information, such as the top row in fastText
                    if len(elems) > 1:
                        idx_to_token.append(token)
                        idx_to_vec.append(elems)
            idx_to_vec = [[0] * len(idx_to_vec[0])] + idx_to_vec
            return idx_to_token, np.array(idx_to_vec)
    
        def __getitem__(self, tokens):
            indices = [self.token_to_idx.get(token, self.unknown_idx)
                       for token in tokens]
            vecs = self.idx_to_vec[np.array(indices)]
            return vecs
    
        def __len__(self):
            return len(self.idx_to_token)

The PyTorch implementation:

.. code:: python

    #@save
    class TokenEmbedding:
        """Token Embedding."""
        def __init__(self, embedding_name):
            self.idx_to_token, self.idx_to_vec = self._load_embedding(
                embedding_name)
            self.unknown_idx = 0
            self.token_to_idx = {token: idx for idx, token in
                                 enumerate(self.idx_to_token)}
    
        def _load_embedding(self, embedding_name):
            idx_to_token, idx_to_vec = [''], []
            data_dir = d2l.download_extract(embedding_name)
            # GloVe website: https://nlp.stanford.edu/projects/glove/
            # fastText website: https://fasttext.cc/
            with open(os.path.join(data_dir, 'vec.txt'), 'r') as f:
                for line in f:
                    elems = line.rstrip().split(' ')
                    token, elems = elems[0], [float(elem) for elem in elems[1:]]
                    # Skip header information, such as the top row in fastText
                    if len(elems) > 1:
                        idx_to_token.append(token)
                        idx_to_vec.append(elems)
            idx_to_vec = [[0] * len(idx_to_vec[0])] + idx_to_vec
            return idx_to_token, torch.tensor(idx_to_vec)
    
        def __getitem__(self, tokens):
            indices = [self.token_to_idx.get(token, self.unknown_idx)
                       for token in tokens]
            vecs = self.idx_to_vec[torch.tensor(indices)]
            return vecs
    
        def __len__(self):
            return len(self.idx_to_token)

Below we load the 50-dimensional GloVe embeddings. When creating the
``TokenEmbedding`` instance, the corresponding embedding file is
downloaded if it has not been downloaded yet.

.. code:: python

    glove_6b50d = TokenEmbedding('glove.6b.50d')

.. parsed-literal::
    :class: output

    Downloading ../data/glove.6B.50d.zip from http://d2l-data.s3-accelerate.amazonaws.com/glove.6B.50d.zip...

Next we output the vocabulary size. The vocabulary contains 400000 words
(tokens) plus a special unknown token.

.. code:: python

    len(glove_6b50d)

.. parsed-literal::
    :class: output

    400001
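
Tokens that are absent from this vocabulary are mapped to the reserved
unknown token at index 0, whose vector is all zeros (see
``_load_embedding`` above). As a minimal illustration (the query string
below is made up), looking up an out-of-vocabulary token falls back to
``unknown_idx``:

.. code:: python

    # Out-of-vocabulary tokens fall back to index 0, the reserved unknown token
    glove_6b50d.token_to_idx.get('made-up-token-xyz', glove_6b50d.unknown_idx)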

We can get the index of a word in the vocabulary, and vice versa.

.. code:: python

    glove_6b50d.token_to_idx['beautiful'], glove_6b50d.idx_to_token[3367]

.. parsed-literal::
    :class: output

    (3367, 'beautiful')
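
To find semantically similar words for a query word, we rank all word
vectors by their cosine similarity to the query vector. Recall that the
cosine similarity of two vectors :math:`\mathbf{x}` and :math:`\mathbf{y}`
(the quantity computed by the ``knn`` function below) is

.. math:: \frac{\mathbf{x}^\top \mathbf{y}}{\|\mathbf{x}\| \|\mathbf{y}\|} \in [-1, 1].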

The ``knn`` (:math:`k`-nearest neighbors) function below returns, for a
query vector ``x``, the indices of the :math:`k` rows of matrix ``W`` with
the largest cosine similarity, together with the similarities themselves.
The MXNet implementation:

.. code:: python

    def knn(W, x, k):
        # Add 1e-9 for numerical stability
        cos = np.dot(W, x.reshape(-1,)) / (
            np.sqrt(np.sum(W * W, axis=1) + 1e-9) * np.sqrt((x * x).sum()))
        topk = npx.topk(cos, k=k, ret_typ='indices')
        return topk, [cos[int(i)] for i in topk]

The PyTorch implementation:

.. code:: python

    def knn(W, x, k):
        # Add 1e-9 for numerical stability
        cos = torch.mv(W, x.reshape(-1,)) / (
            torch.sqrt(torch.sum(W * W, axis=1) + 1e-9) *
            torch.sqrt((x * x).sum()))
        _, topk = torch.topk(cos, k=k)
        return topk, [cos[int(i)] for i in topk]

Then, we search for similar words using the pretrained word vectors from a
``TokenEmbedding`` instance ``embed``, excluding the input word itself.

.. code:: python

    def get_similar_tokens(query_token, k, embed):
        topk, cos = knn(embed.idx_to_vec, embed[[query_token]], k + 1)
        for i, c in zip(topk[1:], cos[1:]):  # Exclude the input word
            print(f'cosine sim={float(c):.3f}: {embed.idx_to_token[int(i)]}')

Let us find the three words in ``glove_6b50d`` that are most semantically
similar to the word "chip".

.. code:: python

    get_similar_tokens('chip', 3, glove_6b50d)

.. parsed-literal::
    :class: output

    cosine sim=0.856: chips
    cosine sim=0.749: intel
    cosine sim=0.749: electronics
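
As a quick cross-check of the first result above (a sketch that assumes
the ``glove_6b50d`` instance loaded earlier), we can compute the cosine
similarity between "chip" and "chips" directly:

.. code:: python

    # Cosine similarity between the vectors of "chip" and "chips";
    # this should be close to the 0.856 reported above
    x, y = glove_6b50d[['chip']][0], glove_6b50d[['chips']][0]
    float((x * y).sum() / ((x * x).sum() ** 0.5 * (y * y).sum() ** 0.5))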

Below outputs similar words to "baby" and "beautiful".

.. code:: python

    get_similar_tokens('baby', 3, glove_6b50d)

.. parsed-literal::
    :class: output

    cosine sim=0.839: babies
    cosine sim=0.800: boy
    cosine sim=0.792: girl

.. code:: python

    get_similar_tokens('beautiful', 3, glove_6b50d)

.. parsed-literal::
    :class: output

    cosine sim=0.921: lovely
    cosine sim=0.893: gorgeous
    cosine sim=0.830: wonderful

Besides finding similar words, we can also apply word vectors to word
analogy tasks. For example, "man" : "woman" :: "son" : "daughter" is the
form of a word analogy: "man" is to "woman" as "son" is to "daughter".
Specifically, the word analogy completion task can be defined as: for a
word analogy :math:`a : b :: c : d`, given the first three words
:math:`a`, :math:`b`, and :math:`c`, find :math:`d`. Denote the vector of
word :math:`w` by :math:`\text{vec}(w)`. To complete the analogy, we will
find the word whose vector is most similar to the result of
:math:`\text{vec}(c)+\text{vec}(b)-\text{vec}(a)`.

.. code:: python

    def get_analogy(token_a, token_b, token_c, embed):
        vecs = embed[[token_a, token_b, token_c]]
        x = vecs[1] - vecs[0] + vecs[2]
        topk, cos = knn(embed.idx_to_vec, x, 1)
        return embed.idx_to_token[int(topk[0])]  # Return the most similar word
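
Note that ``get_analogy`` simply returns the single nearest word, which
can occasionally be one of the three input words themselves. To inspect
more candidates, a small variant (a hypothetical helper, not part of the
book's code) can reuse ``knn`` with a larger :math:`k`:

.. code:: python

    # Hypothetical variant of `get_analogy` for illustration only
    def get_analogy_topk(token_a, token_b, token_c, embed, k=3):
        """Return the k words closest to vec(b) - vec(a) + vec(c)."""
        vecs = embed[[token_a, token_b, token_c]]
        x = vecs[1] - vecs[0] + vecs[2]
        topk, cos = knn(embed.idx_to_vec, x, k)
        return [embed.idx_to_token[int(i)] for i in topk]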

Let us verify the "male-female" analogy using the loaded word vectors.

.. code:: python

    get_analogy('man', 'woman', 'son', glove_6b50d)

.. parsed-literal::
    :class: output

    'daughter'

Below completes a "capital-country" analogy: "beijing" is to "china" as
"tokyo" is to what? The answer should be "japan".

.. code:: python

    get_analogy('beijing', 'china', 'tokyo', glove_6b50d)

.. parsed-literal::
    :class: output

    'japan'

For the "adjective-superlative adjective" analogy such as
"bad" : "worst" :: "big" : ?, we expect "biggest".

.. code:: python

    get_analogy('bad', 'worst', 'big', glove_6b50d)

.. parsed-literal::
    :class: output

    'biggest'

For the "present tense verb-past tense verb" analogy such as
"do" : "did" :: "go" : ?, we expect "went".

.. code:: python

    get_analogy('do', 'did', 'go', glove_6b50d)

.. parsed-literal::
    :class: output

    'went'

`Discussions `__