.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
#@save
d2l.DATA_HUB['time_machine'] = (d2l.DATA_URL + 'timemachine.txt',
'090b5e7e70c295757f55df93cb0a180b9691891a')
def read_time_machine(): #@save
"""Load the time machine dataset into a list of text lines."""
with open(d2l.download('time_machine'), 'r') as f:
lines = f.readlines()
return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]
lines = read_time_machine()
print(f'# text lines: {len(lines)}')
print(lines[0])
print(lines[10])
.. raw:: latex
\diilbookstyleoutputcell
.. parsed-literal::
:class: output
# text lines: 3221
the time machine by h g wells
twinkled and his usually pale face was flushed and animated the
.. raw:: html
.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
#@save
d2l.DATA_HUB['time_machine'] = (d2l.DATA_URL + 'timemachine.txt',
'090b5e7e70c295757f55df93cb0a180b9691891a')
def read_time_machine(): #@save
"""Load the time machine dataset into a list of text lines."""
with open(d2l.download('time_machine'), 'r') as f:
lines = f.readlines()
return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]
lines = read_time_machine()
print(f'# text lines: {len(lines)}')
print(lines[0])
print(lines[10])
.. raw:: latex
\diilbookstyleoutputcell
.. parsed-literal::
:class: output
# text lines: 3221
the time machine by h g wells
twinkled and his usually pale face was flushed and animated the
.. raw:: html
.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
#@save
d2l.DATA_HUB['time_machine'] = (d2l.DATA_URL + 'timemachine.txt',
'090b5e7e70c295757f55df93cb0a180b9691891a')
def read_time_machine(): #@save
"""Load the time machine dataset into a list of text lines."""
with open(d2l.download('time_machine'), 'r') as f:
lines = f.readlines()
return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]
lines = read_time_machine()
print(f'# text lines: {len(lines)}')
print(lines[0])
print(lines[10])
.. raw:: latex
\diilbookstyleoutputcell
.. parsed-literal::
:class: output
# text lines: 3221
the time machine by h g wells
twinkled and his usually pale face was flushed and animated the
.. raw:: html
.. raw:: html
Tokenization
------------
The following ``tokenize`` function takes a list (``lines``) as the
input, where each element is a text sequence (e.g., a text line). Each
text sequence is split into a list of tokens. A *token* is the basic
unit in text. In the end, a list of token lists are returned, where each
token is a string.
.. raw:: html
.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
def tokenize(lines, token='word'): #@save
"""Split text lines into word or character tokens."""
if token == 'word':
return [line.split() for line in lines]
elif token == 'char':
return [list(line) for line in lines]
else:
print('ERROR: unknown token type: ' + token)
tokens = tokenize(lines)
for i in range(11):
print(tokens[i])
.. raw:: latex
\diilbookstyleoutputcell
.. parsed-literal::
:class: output
['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
[]
[]
[]
[]
['i']
[]
[]
['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him']
['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and']
['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the']
.. raw:: html
.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
def tokenize(lines, token='word'): #@save
"""Split text lines into word or character tokens."""
if token == 'word':
return [line.split() for line in lines]
elif token == 'char':
return [list(line) for line in lines]
else:
print('ERROR: unknown token type: ' + token)
tokens = tokenize(lines)
for i in range(11):
print(tokens[i])
.. raw:: latex
\diilbookstyleoutputcell
.. parsed-literal::
:class: output
['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
[]
[]
[]
[]
['i']
[]
[]
['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him']
['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and']
['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the']
.. raw:: html
.. raw:: html
.. raw:: latex
\diilbookstyleinputcell
.. code:: python
def tokenize(lines, token='word'): #@save
"""Split text lines into word or character tokens."""
if token == 'word':
return [line.split() for line in lines]
elif token == 'char':
return [list(line) for line in lines]
else:
print('ERROR: unknown token type: ' + token)
tokens = tokenize(lines)
for i in range(11):
print(tokens[i])
.. raw:: latex
\diilbookstyleoutputcell
.. parsed-literal::
:class: output
['the', 'time', 'machine', 'by', 'h', 'g', 'wells']
[]
[]
[]
[]
['i']
[]
[]
['the', 'time', 'traveller', 'for', 'so', 'it', 'will', 'be', 'convenient', 'to', 'speak', 'of', 'him']
['was', 'expounding', 'a', 'recondite', 'matter', 'to', 'us', 'his', 'grey', 'eyes', 'shone', 'and']
['twinkled', 'and', 'his', 'usually', 'pale', 'face', 'was', 'flushed', 'and', 'animated', 'the']
.. raw:: html
.. raw:: html
Vocabulary
----------
The string type of the token is inconvenient to be used by models, which
take numerical inputs. Now let us build a dictionary, often called
*vocabulary* as well, to map string tokens into numerical indices
starting from 0. To do so, we first count the unique tokens in all the
documents from the training set, namely a *corpus*, and then assign a
numerical index to each unique token according to its frequency. Rarely
appeared tokens are often removed to reduce the complexity. Any token
that does not exist in the corpus or has been removed is mapped into a
special unknown token “