Source code for d2l.mxnet
Open the notebook in Colab
Open the notebook in Colab
Open the notebook in Colab


DATA_HUB = dict()

from mxnet import autograd, context, gluon, image, init, np, npx
from mxnet.gluon import nn, rnn
from import transforms

nn_Module = nn.Block

#################   WARNING   ################
# The below part is generated automatically through:
#    d2lbook build lib
# Don't edit it directly

import collections
import hashlib
import math
import os
import random
import re
import shutil
import sys
import tarfile
import time
import zipfile
from collections import defaultdict
import pandas as pd
import requests
from IPython import display
from matplotlib import pyplot as plt
from matplotlib_inline import backend_inline

d2l = sys.modules[__name__]

from mxnet import autograd, context, gluon, image, init, np, npx
from mxnet.gluon import nn, rnn

[docs]def use_svg_display(): """Use the svg format to display a plot in Jupyter. Defined in :numref:`sec_calculus`""" backend_inline.set_matplotlib_formats('svg')
[docs]def set_figsize(figsize=(3.5, 2.5)): """Set the figure size for matplotlib. Defined in :numref:`sec_calculus`""" use_svg_display() d2l.plt.rcParams['figure.figsize'] = figsize
[docs]def set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend): """Set the axes for matplotlib. Defined in :numref:`sec_calculus`""" axes.set_xlabel(xlabel) axes.set_ylabel(ylabel) axes.set_xscale(xscale) axes.set_yscale(yscale) axes.set_xlim(xlim) axes.set_ylim(ylim) if legend: axes.legend(legend) axes.grid()
[docs]def plot(X, Y=None, xlabel=None, ylabel=None, legend=None, xlim=None, ylim=None, xscale='linear', yscale='linear', fmts=('-', 'm--', 'g-.', 'r:'), figsize=(3.5, 2.5), axes=None): """Plot data points. Defined in :numref:`sec_calculus`""" if legend is None: legend = [] set_figsize(figsize) axes = axes if axes else d2l.plt.gca() # Return True if `X` (tensor or list) has 1 axis def has_one_axis(X): return (hasattr(X, "ndim") and X.ndim == 1 or isinstance(X, list) and not hasattr(X[0], "__len__")) if has_one_axis(X): X = [X] if Y is None: X, Y = [[]] * len(X), X elif has_one_axis(Y): Y = [Y] if len(X) != len(Y): X = X * len(Y) axes.cla() for x, y, fmt in zip(X, Y, fmts): if len(x): axes.plot(x, y, fmt) else: axes.plot(y, fmt) set_axes(axes, xlabel, ylabel, xlim, ylim, xscale, yscale, legend)
[docs]class Timer: """Record multiple running times.""" def __init__(self): """Defined in :numref:`subsec_linear_model`""" self.times = [] self.start()
[docs] def start(self): """Start the timer.""" self.tik = time.time()
[docs] def stop(self): """Stop the timer and record the time in a list.""" self.times.append(time.time() - self.tik) return self.times[-1]
[docs] def avg(self): """Return the average time.""" return sum(self.times) / len(self.times)
[docs] def sum(self): """Return the sum of time.""" return sum(self.times)
[docs] def cumsum(self): """Return the accumulated time.""" return np.array(self.times).cumsum().tolist()
[docs]def synthetic_data(w, b, num_examples): """Generate y = Xw + b + noise. Defined in :numref:`sec_linear_scratch`""" X = d2l.normal(0, 1, (num_examples, len(w))) y = d2l.matmul(X, w) + b y += d2l.normal(0, 0.01, y.shape) return X, d2l.reshape(y, (-1, 1))
[docs]def linreg(X, w, b): """The linear regression model. Defined in :numref:`sec_linear_scratch`""" return d2l.matmul(X, w) + b
[docs]def squared_loss(y_hat, y): """Squared loss. Defined in :numref:`sec_linear_scratch`""" return (y_hat - d2l.reshape(y, y_hat.shape)) ** 2 / 2
[docs]def sgd(params, lr, batch_size): """Minibatch stochastic gradient descent. Defined in :numref:`sec_linear_scratch`""" for param in params: param[:] = param - lr * param.grad / batch_size
[docs]def load_array(data_arrays, batch_size, is_train=True): """Construct a Gluon data iterator. Defined in :numref:`sec_linear_concise`""" dataset =*data_arrays) return, batch_size, shuffle=is_train)
[docs]def get_fashion_mnist_labels(labels): """Return text labels for the Fashion-MNIST dataset. Defined in :numref:`sec_fashion_mnist`""" text_labels = ['t-shirt', 'trouser', 'pullover', 'dress', 'coat', 'sandal', 'shirt', 'sneaker', 'bag', 'ankle boot'] return [text_labels[int(i)] for i in labels]
[docs]def show_images(imgs, num_rows, num_cols, titles=None, scale=1.5): """Plot a list of images. Defined in :numref:`sec_fashion_mnist`""" figsize = (num_cols * scale, num_rows * scale) _, axes = d2l.plt.subplots(num_rows, num_cols, figsize=figsize) axes = axes.flatten() for i, (ax, img) in enumerate(zip(axes, imgs)): ax.imshow(d2l.numpy(img)) ax.axes.get_xaxis().set_visible(False) ax.axes.get_yaxis().set_visible(False) if titles: ax.set_title(titles[i]) return axes
[docs]def get_dataloader_workers(): """Use 4 processes to read the data except for Windows. Defined in :numref:`sec_fashion_mnist`""" return 0 if sys.platform.startswith('win') else 4
[docs]def load_data_fashion_mnist(batch_size, resize=None): """Download the Fashion-MNIST dataset and then load it into memory. Defined in :numref:`sec_fashion_mnist`""" dataset = trans = [dataset.transforms.ToTensor()] if resize: trans.insert(0, dataset.transforms.Resize(resize)) trans = dataset.transforms.Compose(trans) mnist_train = dataset.FashionMNIST(train=True).transform_first(trans) mnist_test = dataset.FashionMNIST(train=False).transform_first(trans) return (, batch_size, shuffle=True, num_workers=get_dataloader_workers()),, batch_size, shuffle=False, num_workers=get_dataloader_workers()))
[docs]def accuracy(y_hat, y): """Compute the number of correct predictions. Defined in :numref:`sec_softmax_scratch`""" if len(y_hat.shape) > 1 and y_hat.shape[1] > 1: y_hat = d2l.argmax(y_hat, axis=1) cmp = d2l.astype(y_hat, y.dtype) == y return float(d2l.reduce_sum(d2l.astype(cmp, y.dtype)))
[docs]def evaluate_accuracy(net, data_iter): """Compute the accuracy for a model on a dataset. Defined in :numref:`sec_softmax_scratch`""" metric = Accumulator(2) # No. of correct predictions, no. of predictions for X, y in data_iter: metric.add(accuracy(net(X), y), d2l.size(y)) return metric[0] / metric[1]
[docs]class Accumulator: """For accumulating sums over `n` variables.""" def __init__(self, n): """Defined in :numref:`sec_softmax_scratch`""" = [0.0] * n
[docs] def add(self, *args): = [a + float(b) for a, b in zip(, args)]
[docs] def reset(self): = [0.0] * len(
def __getitem__(self, idx): return[idx]
[docs]def train_epoch_ch3(net, train_iter, loss, updater): """Train a model within one epoch (defined in Chapter 3). Defined in :numref:`sec_softmax_scratch`""" # Sum of training loss, sum of training accuracy, no. of examples metric = Accumulator(3) if isinstance(updater, gluon.Trainer): updater = updater.step for X, y in train_iter: # Compute gradients and update parameters with autograd.record(): y_hat = net(X) l = loss(y_hat, y) l.backward() updater(X.shape[0]) metric.add(float(l.sum()), accuracy(y_hat, y), y.size) # Return training loss and training accuracy return metric[0] / metric[2], metric[1] / metric[2]
[docs]class Animator: """For plotting data in animation.""" def __init__(self, xlabel=None, ylabel=None, legend=None, xlim=None, ylim=None, xscale='linear', yscale='linear', fmts=('-', 'm--', 'g-.', 'r:'), nrows=1, ncols=1, figsize=(3.5, 2.5)): """Defined in :numref:`sec_softmax_scratch`""" # Incrementally plot multiple lines if legend is None: legend = [] d2l.use_svg_display() self.fig, self.axes = d2l.plt.subplots(nrows, ncols, figsize=figsize) if nrows * ncols == 1: self.axes = [self.axes, ] # Use a lambda function to capture arguments self.config_axes = lambda: d2l.set_axes( self.axes[0], xlabel, ylabel, xlim, ylim, xscale, yscale, legend) self.X, self.Y, self.fmts = None, None, fmts
[docs] def add(self, x, y): # Add multiple data points into the figure if not hasattr(y, "__len__"): y = [y] n = len(y) if not hasattr(x, "__len__"): x = [x] * n if not self.X: self.X = [[] for _ in range(n)] if not self.Y: self.Y = [[] for _ in range(n)] for i, (a, b) in enumerate(zip(x, y)): if a is not None and b is not None: self.X[i].append(a) self.Y[i].append(b) self.axes[0].cla() for x, y, fmt in zip(self.X, self.Y, self.fmts): self.axes[0].plot(x, y, fmt) self.config_axes() display.display(self.fig) display.clear_output(wait=True)
[docs]def train_ch3(net, train_iter, test_iter, loss, num_epochs, updater): """Train a model (defined in Chapter 3). Defined in :numref:`sec_softmax_scratch`""" animator = Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0.3, 0.9], legend=['train loss', 'train acc', 'test acc']) for epoch in range(num_epochs): train_metrics = train_epoch_ch3(net, train_iter, loss, updater) test_acc = evaluate_accuracy(net, test_iter) animator.add(epoch + 1, train_metrics + (test_acc,)) train_loss, train_acc = train_metrics assert train_loss < 0.5, train_loss assert train_acc <= 1 and train_acc > 0.7, train_acc assert test_acc <= 1 and test_acc > 0.7, test_acc
[docs]def predict_ch3(net, test_iter, n=6): """Predict labels (defined in Chapter 3). Defined in :numref:`sec_softmax_scratch`""" for X, y in test_iter: break trues = d2l.get_fashion_mnist_labels(y) preds = d2l.get_fashion_mnist_labels(d2l.argmax(net(X), axis=1)) titles = [true +'\n' + pred for true, pred in zip(trues, preds)] d2l.show_images( d2l.reshape(X[0:n], (n, 28, 28)), 1, n, titles=titles[0:n])
[docs]def evaluate_loss(net, data_iter, loss): """Evaluate the loss of a model on the given dataset. Defined in :numref:`sec_model_selection`""" metric = d2l.Accumulator(2) # Sum of losses, no. of examples for X, y in data_iter: l = loss(net(X), y) metric.add(d2l.reduce_sum(l), d2l.size(l)) return metric[0] / metric[1]
DATA_HUB = dict() DATA_URL = ''
[docs]def download(name, cache_dir=os.path.join('..', 'data')): """Download a file inserted into DATA_HUB, return the local filename. Defined in :numref:`sec_kaggle_house`""" assert name in DATA_HUB, f"{name} does not exist in {DATA_HUB}." url, sha1_hash = DATA_HUB[name] os.makedirs(cache_dir, exist_ok=True) fname = os.path.join(cache_dir, url.split('/')[-1]) if os.path.exists(fname): sha1 = hashlib.sha1() with open(fname, 'rb') as f: while True: data = if not data: break sha1.update(data) if sha1.hexdigest() == sha1_hash: return fname # Hit cache print(f'Downloading {fname} from {url}...') r = requests.get(url, stream=True, verify=True) with open(fname, 'wb') as f: f.write(r.content) return fname
[docs]def download_extract(name, folder=None): """Download and extract a zip/tar file. Defined in :numref:`sec_kaggle_house`""" fname = download(name) base_dir = os.path.dirname(fname) data_dir, ext = os.path.splitext(fname) if ext == '.zip': fp = zipfile.ZipFile(fname, 'r') elif ext in ('.tar', '.gz'): fp =, 'r') else: assert False, 'Only zip/tar files can be extracted.' fp.extractall(base_dir) return os.path.join(base_dir, folder) if folder else data_dir
[docs]def download_all(): """Download all files in the DATA_HUB. Defined in :numref:`sec_kaggle_house`""" for name in DATA_HUB: download(name)
DATA_HUB['kaggle_house_train'] = ( DATA_URL + 'kaggle_house_pred_train.csv', '585e9cc93e70b39160e7921475f9bcd7d31219ce') DATA_HUB['kaggle_house_test'] = ( DATA_URL + 'kaggle_house_pred_test.csv', 'fa19780a7b011d9b009e8bff8e99922a8ee2eb90')
[docs]def try_gpu(i=0): """Return gpu(i) if exists, otherwise return cpu(). Defined in :numref:`sec_use_gpu`""" return npx.gpu(i) if npx.num_gpus() >= i + 1 else npx.cpu()
[docs]def try_all_gpus(): """Return all available GPUs, or [cpu()] if no GPU exists. Defined in :numref:`sec_use_gpu`""" devices = [npx.gpu(i) for i in range(npx.num_gpus())] return devices if devices else [npx.cpu()]
[docs]def corr2d(X, K): """Compute 2D cross-correlation. Defined in :numref:`sec_conv_layer`""" h, w = K.shape Y = d2l.zeros((X.shape[0] - h + 1, X.shape[1] - w + 1)) for i in range(Y.shape[0]): for j in range(Y.shape[1]): Y[i, j] = d2l.reduce_sum((X[i: i + h, j: j + w] * K)) return Y
[docs]def evaluate_accuracy_gpu(net, data_iter, device=None): """Compute the accuracy for a model on a dataset using a GPU. Defined in :numref:`sec_lenet`""" if not device: # Query the first device where the first parameter is on device = list(net.collect_params().values())[0].list_ctx()[0] # No. of correct predictions, no. of predictions metric = d2l.Accumulator(2) for X, y in data_iter: X, y = X.as_in_ctx(device), y.as_in_ctx(device) metric.add(d2l.accuracy(net(X), y), d2l.size(y)) return metric[0] / metric[1]
[docs]def train_ch6(net, train_iter, test_iter, num_epochs, lr, device): """Train a model with a GPU (defined in Chapter 6). Defined in :numref:`sec_lenet`""" net.initialize(force_reinit=True, ctx=device, init=init.Xavier()) loss = gluon.loss.SoftmaxCrossEntropyLoss() trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr}) animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs], legend=['train loss', 'train acc', 'test acc']) timer, num_batches = d2l.Timer(), len(train_iter) for epoch in range(num_epochs): # Sum of training loss, sum of training accuracy, no. of examples metric = d2l.Accumulator(3) for i, (X, y) in enumerate(train_iter): timer.start() # Here is the major difference from `d2l.train_epoch_ch3` X, y = X.as_in_ctx(device), y.as_in_ctx(device) with autograd.record(): y_hat = net(X) l = loss(y_hat, y) l.backward() trainer.step(X.shape[0]) metric.add(l.sum(), d2l.accuracy(y_hat, y), X.shape[0]) timer.stop() train_l = metric[0] / metric[2] train_acc = metric[1] / metric[2] if (i + 1) % (num_batches // 5) == 0 or i == num_batches - 1: animator.add(epoch + (i + 1) / num_batches, (train_l, train_acc, None)) test_acc = evaluate_accuracy_gpu(net, test_iter) animator.add(epoch + 1, (None, None, test_acc)) print(f'loss {train_l:.3f}, train acc {train_acc:.3f}, ' f'test acc {test_acc:.3f}') print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec ' f'on {str(device)}')
[docs]class Residual(nn.Block): """The Residual block of ResNet.""" def __init__(self, num_channels, use_1x1conv=False, strides=1, **kwargs): super().__init__(**kwargs) self.conv1 = nn.Conv2D(num_channels, kernel_size=3, padding=1, strides=strides) self.conv2 = nn.Conv2D(num_channels, kernel_size=3, padding=1) if use_1x1conv: self.conv3 = nn.Conv2D(num_channels, kernel_size=1, strides=strides) else: self.conv3 = None self.bn1 = nn.BatchNorm() self.bn2 = nn.BatchNorm()
[docs] def forward(self, X): Y = npx.relu(self.bn1(self.conv1(X))) Y = self.bn2(self.conv2(Y)) if self.conv3: X = self.conv3(X) return npx.relu(Y + X)
d2l.DATA_HUB['time_machine'] = (d2l.DATA_URL + 'timemachine.txt', '090b5e7e70c295757f55df93cb0a180b9691891a')
[docs]def read_time_machine(): """Load the time machine dataset into a list of text lines. Defined in :numref:`sec_text_preprocessing`""" with open('time_machine'), 'r') as f: lines = f.readlines() return [re.sub('[^A-Za-z]+', ' ', line).strip().lower() for line in lines]
[docs]def tokenize(lines, token='word'): """Split text lines into word or character tokens. Defined in :numref:`sec_text_preprocessing`""" if token == 'word': return [line.split() for line in lines] elif token == 'char': return [list(line) for line in lines] else: print('ERROR: unknown token type: ' + token)
[docs]class Vocab: """Vocabulary for text.""" def __init__(self, tokens=None, min_freq=0, reserved_tokens=None): """Defined in :numref:`sec_text_preprocessing`""" if tokens is None: tokens = [] if reserved_tokens is None: reserved_tokens = [] # Sort according to frequencies counter = count_corpus(tokens) self._token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True) # The index for the unknown token is 0 self.idx_to_token = ['<unk>'] + reserved_tokens self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)} for token, freq in self._token_freqs: if freq < min_freq: break if token not in self.token_to_idx: self.idx_to_token.append(token) self.token_to_idx[token] = len(self.idx_to_token) - 1 def __len__(self): return len(self.idx_to_token) def __getitem__(self, tokens): if not isinstance(tokens, (list, tuple)): return self.token_to_idx.get(tokens, self.unk) return [self.__getitem__(token) for token in tokens]
[docs] def to_tokens(self, indices): if not isinstance(indices, (list, tuple)): return self.idx_to_token[indices] return [self.idx_to_token[index] for index in indices]
@property def unk(self): # Index for the unknown token return 0 @property def token_freqs(self): # Index for the unknown token return self._token_freqs
[docs]def count_corpus(tokens): """Count token frequencies. Defined in :numref:`sec_text_preprocessing`""" # Here `tokens` is a 1D list or 2D list if len(tokens) == 0 or isinstance(tokens[0], list): # Flatten a list of token lists into a list of tokens tokens = [token for line in tokens for token in line] return collections.Counter(tokens)
[docs]def load_corpus_time_machine(max_tokens=-1): """Return token indices and the vocabulary of the time machine dataset. Defined in :numref:`sec_text_preprocessing`""" lines = read_time_machine() tokens = tokenize(lines, 'char') vocab = Vocab(tokens) # Since each text line in the time machine dataset is not necessarily a # sentence or a paragraph, flatten all the text lines into a single list corpus = [vocab[token] for line in tokens for token in line] if max_tokens > 0: corpus = corpus[:max_tokens] return corpus, vocab
[docs]def seq_data_iter_random(corpus, batch_size, num_steps): """Generate a minibatch of subsequences using random sampling. Defined in :numref:`sec_language_model`""" # Start with a random offset (inclusive of `num_steps - 1`) to partition a # sequence corpus = corpus[random.randint(0, num_steps - 1):] # Subtract 1 since we need to account for labels num_subseqs = (len(corpus) - 1) // num_steps # The starting indices for subsequences of length `num_steps` initial_indices = list(range(0, num_subseqs * num_steps, num_steps)) # In random sampling, the subsequences from two adjacent random # minibatches during iteration are not necessarily adjacent on the # original sequence random.shuffle(initial_indices) def data(pos): # Return a sequence of length `num_steps` starting from `pos` return corpus[pos: pos + num_steps] num_batches = num_subseqs // batch_size for i in range(0, batch_size * num_batches, batch_size): # Here, `initial_indices` contains randomized starting indices for # subsequences initial_indices_per_batch = initial_indices[i: i + batch_size] X = [data(j) for j in initial_indices_per_batch] Y = [data(j + 1) for j in initial_indices_per_batch] yield d2l.tensor(X), d2l.tensor(Y)
[docs]def seq_data_iter_sequential(corpus, batch_size, num_steps): """Generate a minibatch of subsequences using sequential partitioning. Defined in :numref:`sec_language_model`""" # Start with a random offset to partition a sequence offset = random.randint(0, num_steps) num_tokens = ((len(corpus) - offset - 1) // batch_size) * batch_size Xs = d2l.tensor(corpus[offset: offset + num_tokens]) Ys = d2l.tensor(corpus[offset + 1: offset + 1 + num_tokens]) Xs, Ys = Xs.reshape(batch_size, -1), Ys.reshape(batch_size, -1) num_batches = Xs.shape[1] // num_steps for i in range(0, num_steps * num_batches, num_steps): X = Xs[:, i: i + num_steps] Y = Ys[:, i: i + num_steps] yield X, Y
[docs]class SeqDataLoader: """An iterator to load sequence data.""" def __init__(self, batch_size, num_steps, use_random_iter, max_tokens): """Defined in :numref:`sec_language_model`""" if use_random_iter: self.data_iter_fn = d2l.seq_data_iter_random else: self.data_iter_fn = d2l.seq_data_iter_sequential self.corpus, self.vocab = d2l.load_corpus_time_machine(max_tokens) self.batch_size, self.num_steps = batch_size, num_steps def __iter__(self): return self.data_iter_fn(self.corpus, self.batch_size, self.num_steps)
[docs]def load_data_time_machine(batch_size, num_steps, use_random_iter=False, max_tokens=10000): """Return the iterator and the vocabulary of the time machine dataset. Defined in :numref:`sec_language_model`""" data_iter = SeqDataLoader( batch_size, num_steps, use_random_iter, max_tokens) return data_iter, data_iter.vocab
[docs]class RNNModelScratch: """An RNN Model implemented from scratch.""" def __init__(self, vocab_size, num_hiddens, device, get_params, init_state, forward_fn): """Defined in :numref:`sec_rnn_scratch`""" self.vocab_size, self.num_hiddens = vocab_size, num_hiddens self.params = get_params(vocab_size, num_hiddens, device) self.init_state, self.forward_fn = init_state, forward_fn def __call__(self, X, state): X = npx.one_hot(X.T, self.vocab_size) return self.forward_fn(X, state, self.params)
[docs] def begin_state(self, batch_size, ctx): return self.init_state(batch_size, self.num_hiddens, ctx)
[docs]def predict_ch8(prefix, num_preds, net, vocab, device): """Generate new characters following the `prefix`. Defined in :numref:`sec_rnn_scratch`""" state = net.begin_state(batch_size=1, ctx=device) outputs = [vocab[prefix[0]]] get_input = lambda: d2l.reshape( d2l.tensor([outputs[-1]], ctx=device), (1, 1)) for y in prefix[1:]: # Warm-up period _, state = net(get_input(), state) outputs.append(vocab[y]) for _ in range(num_preds): # Predict `num_preds` steps y, state = net(get_input(), state) outputs.append(int(y.argmax(axis=1).reshape(1))) return ''.join([vocab.idx_to_token[i] for i in outputs])
[docs]def grad_clipping(net, theta): """Clip the gradient. Defined in :numref:`sec_rnn_scratch`""" if isinstance(net, gluon.Block): params = [ for p in net.collect_params().values()] else: params = net.params norm = math.sqrt(sum((p.grad ** 2).sum() for p in params)) if norm > theta: for param in params: param.grad[:] *= theta / norm
[docs]def train_epoch_ch8(net, train_iter, loss, updater, device, use_random_iter): """Train a model within one epoch (defined in Chapter 8). Defined in :numref:`sec_rnn_scratch`""" state, timer = None, d2l.Timer() metric = d2l.Accumulator(2) # Sum of training loss, no. of tokens for X, Y in train_iter: if state is None or use_random_iter: # Initialize `state` when either it is the first iteration or # using random sampling state = net.begin_state(batch_size=X.shape[0], ctx=device) else: for s in state: s.detach() y = Y.T.reshape(-1) X, y = X.as_in_ctx(device), y.as_in_ctx(device) with autograd.record(): y_hat, state = net(X, state) l = loss(y_hat, y).mean() l.backward() grad_clipping(net, 1) updater(batch_size=1) # Since the `mean` function has been invoked metric.add(l * d2l.size(y), d2l.size(y)) return math.exp(metric[0] / metric[1]), metric[1] / timer.stop()
[docs]def train_ch8(net, train_iter, vocab, lr, num_epochs, device, use_random_iter=False): """Train a model (defined in Chapter 8). Defined in :numref:`sec_rnn_scratch`""" loss = gluon.loss.SoftmaxCrossEntropyLoss() animator = d2l.Animator(xlabel='epoch', ylabel='perplexity', legend=['train'], xlim=[10, num_epochs]) # Initialize if isinstance(net, gluon.Block): net.initialize(ctx=device, force_reinit=True, init=init.Normal(0.01)) trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': lr}) updater = lambda batch_size: trainer.step(batch_size) else: updater = lambda batch_size: d2l.sgd(net.params, lr, batch_size) predict = lambda prefix: predict_ch8(prefix, 50, net, vocab, device) # Train and predict for epoch in range(num_epochs): ppl, speed = train_epoch_ch8( net, train_iter, loss, updater, device, use_random_iter) if (epoch + 1) % 10 == 0: animator.add(epoch + 1, [ppl]) print(f'perplexity {ppl:.1f}, {speed:.1f} tokens/sec on {str(device)}') print(predict('time traveller')) print(predict('traveller'))
[docs]class RNNModel(nn.Block): """The RNN model. Defined in :numref:`sec_rnn-concise`""" def __init__(self, rnn_layer, vocab_size, **kwargs): super(RNNModel, self).__init__(**kwargs) self.rnn = rnn_layer self.vocab_size = vocab_size self.dense = nn.Dense(vocab_size)
[docs] def forward(self, inputs, state): X = npx.one_hot(inputs.T, self.vocab_size) Y, state = self.rnn(X, state) # The fully-connected layer will first change the shape of `Y` to # (`num_steps` * `batch_size`, `num_hiddens`). Its output shape is # (`num_steps` * `batch_size`, `vocab_size`). output = self.dense(Y.reshape(-1, Y.shape[-1])) return output, state
[docs] def begin_state(self, *args, **kwargs): return self.rnn.begin_state(*args, **kwargs)
d2l.DATA_HUB['fra-eng'] = (d2l.DATA_URL + '', '94646ad1522d915e7b0f9296181140edcf86a4f5')
[docs]def read_data_nmt(): """Load the English-French dataset. Defined in :numref:`sec_machine_translation`""" data_dir = d2l.download_extract('fra-eng') with open(os.path.join(data_dir, 'fra.txt'), 'r') as f: return
[docs]def preprocess_nmt(text): """Preprocess the English-French dataset. Defined in :numref:`sec_machine_translation`""" def no_space(char, prev_char): return char in set(',.!?') and prev_char != ' ' # Replace non-breaking space with space, and convert uppercase letters to # lowercase ones text = text.replace('\u202f', ' ').replace('\xa0', ' ').lower() # Insert space between words and punctuation marks out = [' ' + char if i > 0 and no_space(char, text[i - 1]) else char for i, char in enumerate(text)] return ''.join(out)
[docs]def tokenize_nmt(text, num_examples=None): """Tokenize the English-French dataset. Defined in :numref:`sec_machine_translation`""" source, target = [], [] for i, line in enumerate(text.split('\n')): if num_examples and i > num_examples: break parts = line.split('\t') if len(parts) == 2: source.append(parts[0].split(' ')) target.append(parts[1].split(' ')) return source, target
[docs]def show_list_len_pair_hist(legend, xlabel, ylabel, xlist, ylist): """Plot the histogram for list length pairs. Defined in :numref:`sec_machine_translation`""" d2l.set_figsize() _, _, patches = d2l.plt.hist( [[len(l) for l in xlist], [len(l) for l in ylist]]) d2l.plt.xlabel(xlabel) d2l.plt.ylabel(ylabel) for patch in patches[1].patches: patch.set_hatch('/') d2l.plt.legend(legend)
[docs]def truncate_pad(line, num_steps, padding_token): """Truncate or pad sequences. Defined in :numref:`sec_machine_translation`""" if len(line) > num_steps: return line[:num_steps] # Truncate return line + [padding_token] * (num_steps - len(line)) # Pad
[docs]def build_array_nmt(lines, vocab, num_steps): """Transform text sequences of machine translation into minibatches. Defined in :numref:`subsec_mt_data_loading`""" lines = [vocab[l] for l in lines] lines = [l + [vocab['<eos>']] for l in lines] array = d2l.tensor([truncate_pad( l, num_steps, vocab['<pad>']) for l in lines]) valid_len = d2l.reduce_sum( d2l.astype(array != vocab['<pad>'], d2l.int32), 1) return array, valid_len
[docs]def load_data_nmt(batch_size, num_steps, num_examples=600): """Return the iterator and the vocabularies of the translation dataset. Defined in :numref:`subsec_mt_data_loading`""" text = preprocess_nmt(read_data_nmt()) source, target = tokenize_nmt(text, num_examples) src_vocab = d2l.Vocab(source, min_freq=2, reserved_tokens=['<pad>', '<bos>', '<eos>']) tgt_vocab = d2l.Vocab(target, min_freq=2, reserved_tokens=['<pad>', '<bos>', '<eos>']) src_array, src_valid_len = build_array_nmt(source, src_vocab, num_steps) tgt_array, tgt_valid_len = build_array_nmt(target, tgt_vocab, num_steps) data_arrays = (src_array, src_valid_len, tgt_array, tgt_valid_len) data_iter = d2l.load_array(data_arrays, batch_size) return data_iter, src_vocab, tgt_vocab
[docs]class Encoder(nn.Block): """The base encoder interface for the encoder-decoder architecture.""" def __init__(self, **kwargs): super(Encoder, self).__init__(**kwargs)
[docs] def forward(self, X, *args): raise NotImplementedError
[docs]class Decoder(nn.Block): """The base decoder interface for the encoder-decoder architecture. Defined in :numref:`sec_encoder-decoder`""" def __init__(self, **kwargs): super(Decoder, self).__init__(**kwargs)
[docs] def init_state(self, enc_outputs, *args): raise NotImplementedError
[docs] def forward(self, X, state): raise NotImplementedError
[docs]class EncoderDecoder(nn.Block): """The base class for the encoder-decoder architecture. Defined in :numref:`sec_encoder-decoder`""" def __init__(self, encoder, decoder, **kwargs): super(EncoderDecoder, self).__init__(**kwargs) self.encoder = encoder self.decoder = decoder
[docs] def forward(self, enc_X, dec_X, *args): enc_outputs = self.encoder(enc_X, *args) dec_state = self.decoder.init_state(enc_outputs, *args) return self.decoder(dec_X, dec_state)
[docs]class Seq2SeqEncoder(d2l.Encoder): """The RNN encoder for sequence to sequence learning. Defined in :numref:`sec_seq2seq`""" def __init__(self, vocab_size, embed_size, num_hiddens, num_layers, dropout=0, **kwargs): super(Seq2SeqEncoder, self).__init__(**kwargs) # Embedding layer self.embedding = nn.Embedding(vocab_size, embed_size) self.rnn = rnn.GRU(num_hiddens, num_layers, dropout=dropout)
[docs] def forward(self, X, *args): # The output `X` shape: (`batch_size`, `num_steps`, `embed_size`) X = self.embedding(X) # In RNN models, the first axis corresponds to time steps X = X.swapaxes(0, 1) state = self.rnn.begin_state(batch_size=X.shape[1], ctx=X.ctx) output, state = self.rnn(X, state) # `output` shape: (`num_steps`, `batch_size`, `num_hiddens`) # `state[0]` shape: (`num_layers`, `batch_size`, `num_hiddens`) return output, state
[docs]class MaskedSoftmaxCELoss(gluon.loss.SoftmaxCELoss): """The softmax cross-entropy loss with masks. Defined in :numref:`sec_seq2seq_decoder`""" # `pred` shape: (`batch_size`, `num_steps`, `vocab_size`) # `label` shape: (`batch_size`, `num_steps`) # `valid_len` shape: (`batch_size`,)
[docs] def forward(self, pred, label, valid_len): # `weights` shape: (`batch_size`, `num_steps`, 1) weights = np.expand_dims(np.ones_like(label), axis=-1) weights = npx.sequence_mask(weights, valid_len, True, axis=1) return super(MaskedSoftmaxCELoss, self).forward(pred, label, weights)
[docs]def train_seq2seq(net, data_iter, lr, num_epochs, tgt_vocab, device): """Train a model for sequence to sequence. Defined in :numref:`sec_seq2seq_decoder`""" net.initialize(init.Xavier(), force_reinit=True, ctx=device) trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': lr}) loss = MaskedSoftmaxCELoss() animator = d2l.Animator(xlabel='epoch', ylabel='loss', xlim=[10, num_epochs]) for epoch in range(num_epochs): timer = d2l.Timer() metric = d2l.Accumulator(2) # Sum of training loss, no. of tokens for batch in data_iter: X, X_valid_len, Y, Y_valid_len = [ x.as_in_ctx(device) for x in batch] bos = np.array( [tgt_vocab['<bos>']] * Y.shape[0], ctx=device).reshape(-1, 1) dec_input = d2l.concat([bos, Y[:, :-1]], 1) # Teacher forcing with autograd.record(): Y_hat, _ = net(X, dec_input, X_valid_len) l = loss(Y_hat, Y, Y_valid_len) l.backward() d2l.grad_clipping(net, 1) num_tokens = Y_valid_len.sum() trainer.step(num_tokens) metric.add(l.sum(), num_tokens) if (epoch + 1) % 10 == 0: animator.add(epoch + 1, (metric[0] / metric[1],)) print(f'loss {metric[0] / metric[1]:.3f}, {metric[1] / timer.stop():.1f} ' f'tokens/sec on {str(device)}')
[docs]def predict_seq2seq(net, src_sentence, src_vocab, tgt_vocab, num_steps, device, save_attention_weights=False): """Predict for sequence to sequence. Defined in :numref:`sec_seq2seq_training`""" src_tokens = src_vocab[src_sentence.lower().split(' ')] + [ src_vocab['<eos>']] enc_valid_len = np.array([len(src_tokens)], ctx=device) src_tokens = d2l.truncate_pad(src_tokens, num_steps, src_vocab['<pad>']) # Add the batch axis enc_X = np.expand_dims(np.array(src_tokens, ctx=device), axis=0) enc_outputs = net.encoder(enc_X, enc_valid_len) dec_state = net.decoder.init_state(enc_outputs, enc_valid_len) # Add the batch axis dec_X = np.expand_dims(np.array([tgt_vocab['<bos>']], ctx=device), axis=0) output_seq, attention_weight_seq = [], [] for _ in range(num_steps): Y, dec_state = net.decoder(dec_X, dec_state) # We use the token with the highest prediction likelihood as the input # of the decoder at the next time step dec_X = Y.argmax(axis=2) pred = dec_X.squeeze(axis=0).astype('int32').item() # Save attention weights (to be covered later) if save_attention_weights: attention_weight_seq.append(net.decoder.attention_weights) # Once the end-of-sequence token is predicted, the generation of the # output sequence is complete if pred == tgt_vocab['<eos>']: break output_seq.append(pred) return ' '.join(tgt_vocab.to_tokens(output_seq)), attention_weight_seq
[docs]def bleu(pred_seq, label_seq, k): """Compute the BLEU. Defined in :numref:`sec_seq2seq_training`""" pred_tokens, label_tokens = pred_seq.split(' '), label_seq.split(' ') len_pred, len_label = len(pred_tokens), len(label_tokens) score = math.exp(min(0, 1 - len_label / len_pred)) for n in range(1, k + 1): num_matches, label_subs = 0, collections.defaultdict(int) for i in range(len_label - n + 1): label_subs[' '.join(label_tokens[i: i + n])] += 1 for i in range(len_pred - n + 1): if label_subs[' '.join(pred_tokens[i: i + n])] > 0: num_matches += 1 label_subs[' '.join(pred_tokens[i: i + n])] -= 1 score *= math.pow(num_matches / (len_pred - n + 1), math.pow(0.5, n)) return score
[docs]def show_heatmaps(matrices, xlabel, ylabel, titles=None, figsize=(2.5, 2.5), cmap='Reds'): """Show heatmaps of matrices. Defined in :numref:`sec_attention-cues`""" d2l.use_svg_display() num_rows, num_cols = matrices.shape[0], matrices.shape[1] fig, axes = d2l.plt.subplots(num_rows, num_cols, figsize=figsize, sharex=True, sharey=True, squeeze=False) for i, (row_axes, row_matrices) in enumerate(zip(axes, matrices)): for j, (ax, matrix) in enumerate(zip(row_axes, row_matrices)): pcm = ax.imshow(d2l.numpy(matrix), cmap=cmap) if i == num_rows - 1: ax.set_xlabel(xlabel) if j == 0: ax.set_ylabel(ylabel) if titles: ax.set_title(titles[j]) fig.colorbar(pcm, ax=axes, shrink=0.6);
[docs]def masked_softmax(X, valid_lens): """Perform softmax operation by masking elements on the last axis. Defined in :numref:`sec_attention-scoring-functions`""" # `X`: 3D tensor, `valid_lens`: 1D or 2D tensor if valid_lens is None: return npx.softmax(X) else: shape = X.shape if valid_lens.ndim == 1: valid_lens = valid_lens.repeat(shape[1]) else: valid_lens = valid_lens.reshape(-1) # On the last axis, replace masked elements with a very large negative # value, whose exponentiation outputs 0 X = npx.sequence_mask(X.reshape(-1, shape[-1]), valid_lens, True, value=-1e6, axis=1) return npx.softmax(X).reshape(shape)
[docs]class AdditiveAttention(nn.Block): """Additive attention. Defined in :numref:`sec_attention-scoring-functions`""" def __init__(self, num_hiddens, dropout, **kwargs): super(AdditiveAttention, self).__init__(**kwargs) # Use `flatten=False` to only transform the last axis so that the # shapes for the other axes are kept the same self.W_k = nn.Dense(num_hiddens, use_bias=False, flatten=False) self.W_q = nn.Dense(num_hiddens, use_bias=False, flatten=False) self.w_v = nn.Dense(1, use_bias=False, flatten=False) self.dropout = nn.Dropout(dropout)
[docs] def forward(self, queries, keys, values, valid_lens): queries, keys = self.W_q(queries), self.W_k(keys) # After dimension expansion, shape of `queries`: (`batch_size`, no. of # queries, 1, `num_hiddens`) and shape of `keys`: (`batch_size`, 1, # no. of key-value pairs, `num_hiddens`). Sum them up with # broadcasting features = np.expand_dims(queries, axis=2) + np.expand_dims( keys, axis=1) features = np.tanh(features) # There is only one output of `self.w_v`, so we remove the last # one-dimensional entry from the shape. Shape of `scores`: # (`batch_size`, no. of queries, no. of key-value pairs) scores = np.squeeze(self.w_v(features), axis=-1) self.attention_weights = masked_softmax(scores, valid_lens) # Shape of `values`: (`batch_size`, no. of key-value pairs, value # dimension) return npx.batch_dot(self.dropout(self.attention_weights), values)
[docs]class DotProductAttention(nn.Block): """Scaled dot product attention. Defined in :numref:`subsec_additive-attention`""" def __init__(self, dropout, **kwargs): super(DotProductAttention, self).__init__(**kwargs) self.dropout = nn.Dropout(dropout) # Shape of `queries`: (`batch_size`, no. of queries, `d`) # Shape of `keys`: (`batch_size`, no. of key-value pairs, `d`) # Shape of `values`: (`batch_size`, no. of key-value pairs, value # dimension) # Shape of `valid_lens`: (`batch_size`,) or (`batch_size`, no. of queries)
[docs] def forward(self, queries, keys, values, valid_lens=None): d = queries.shape[-1] # Set `transpose_b=True` to swap the last two dimensions of `keys` scores = npx.batch_dot(queries, keys, transpose_b=True) / math.sqrt(d) self.attention_weights = masked_softmax(scores, valid_lens) return npx.batch_dot(self.dropout(self.attention_weights), values)
[docs]class AttentionDecoder(d2l.Decoder): """The base attention-based decoder interface. Defined in :numref:`sec_seq2seq_attention`""" def __init__(self, **kwargs): super(AttentionDecoder, self).__init__(**kwargs) @property def attention_weights(self): raise NotImplementedError
[docs]class MultiHeadAttention(nn.Block): """Multi-head attention. Defined in :numref:`sec_multihead-attention`""" def __init__(self, num_hiddens, num_heads, dropout, use_bias=False, **kwargs): super(MultiHeadAttention, self).__init__(**kwargs) self.num_heads = num_heads self.attention = d2l.DotProductAttention(dropout) self.W_q = nn.Dense(num_hiddens, use_bias=use_bias, flatten=False) self.W_k = nn.Dense(num_hiddens, use_bias=use_bias, flatten=False) self.W_v = nn.Dense(num_hiddens, use_bias=use_bias, flatten=False) self.W_o = nn.Dense(num_hiddens, use_bias=use_bias, flatten=False)
[docs] def forward(self, queries, keys, values, valid_lens): # Shape of `queries`, `keys`, or `values`: # (`batch_size`, no. of queries or key-value pairs, `num_hiddens`) # Shape of `valid_lens`: # (`batch_size`,) or (`batch_size`, no. of queries) # After transposing, shape of output `queries`, `keys`, or `values`: # (`batch_size` * `num_heads`, no. of queries or key-value pairs, # `num_hiddens` / `num_heads`) queries = transpose_qkv(self.W_q(queries), self.num_heads) keys = transpose_qkv(self.W_k(keys), self.num_heads) values = transpose_qkv(self.W_v(values), self.num_heads) if valid_lens is not None: # On axis 0, copy the first item (scalar or vector) for # `num_heads` times, then copy the next item, and so on valid_lens = valid_lens.repeat(self.num_heads, axis=0) # Shape of `output`: (`batch_size` * `num_heads`, no. of queries, # `num_hiddens` / `num_heads`) output = self.attention(queries, keys, values, valid_lens) # Shape of `output_concat`: # (`batch_size`, no. of queries, `num_hiddens`) output_concat = transpose_output(output, self.num_heads) return self.W_o(output_concat)
[docs]def transpose_qkv(X, num_heads): """Transposition for parallel computation of multiple attention heads. Defined in :numref:`sec_multihead-attention`""" # Shape of input `X`: # (`batch_size`, no. of queries or key-value pairs, `num_hiddens`). # Shape of output `X`: # (`batch_size`, no. of queries or key-value pairs, `num_heads`, # `num_hiddens` / `num_heads`) X = X.reshape(X.shape[0], X.shape[1], num_heads, -1) # Shape of output `X`: # (`batch_size`, `num_heads`, no. of queries or key-value pairs, # `num_hiddens` / `num_heads`) X = X.transpose(0, 2, 1, 3) # Shape of `output`: # (`batch_size` * `num_heads`, no. of queries or key-value pairs, # `num_hiddens` / `num_heads`) return X.reshape(-1, X.shape[2], X.shape[3])
[docs]def transpose_output(X, num_heads): """Reverse the operation of `transpose_qkv`. Defined in :numref:`sec_multihead-attention`""" X = X.reshape(-1, num_heads, X.shape[1], X.shape[2]) X = X.transpose(0, 2, 1, 3) return X.reshape(X.shape[0], X.shape[1], -1)
[docs]class PositionalEncoding(nn.Block): """Positional encoding. Defined in :numref:`sec_self-attention-and-positional-encoding`""" def __init__(self, num_hiddens, dropout, max_len=1000): super(PositionalEncoding, self).__init__() self.dropout = nn.Dropout(dropout) # Create a long enough `P` self.P = d2l.zeros((1, max_len, num_hiddens)) X = d2l.arange(max_len).reshape(-1, 1) / np.power( 10000, np.arange(0, num_hiddens, 2) / num_hiddens) self.P[:, :, 0::2] = np.sin(X) self.P[:, :, 1::2] = np.cos(X)
[docs] def forward(self, X): X = X + self.P[:, :X.shape[1], :].as_in_ctx(X.ctx) return self.dropout(X)
[docs]class PositionWiseFFN(nn.Block): """Positionwise feed-forward network. Defined in :numref:`sec_transformer`""" def __init__(self, ffn_num_hiddens, ffn_num_outputs, **kwargs): super(PositionWiseFFN, self).__init__(**kwargs) self.dense1 = nn.Dense(ffn_num_hiddens, flatten=False, activation='relu') self.dense2 = nn.Dense(ffn_num_outputs, flatten=False)
[docs] def forward(self, X): return self.dense2(self.dense1(X))
[docs]class AddNorm(nn.Block): """Residual connection followed by layer normalization. Defined in :numref:`sec_transformer`""" def __init__(self, dropout, **kwargs): super(AddNorm, self).__init__(**kwargs) self.dropout = nn.Dropout(dropout) self.ln = nn.LayerNorm()
[docs] def forward(self, X, Y): return self.ln(self.dropout(Y) + X)
[docs]class EncoderBlock(nn.Block): """Transformer encoder block. Defined in :numref:`sec_transformer`""" def __init__(self, num_hiddens, ffn_num_hiddens, num_heads, dropout, use_bias=False, **kwargs): super(EncoderBlock, self).__init__(**kwargs) self.attention = d2l.MultiHeadAttention( num_hiddens, num_heads, dropout, use_bias) self.addnorm1 = AddNorm(dropout) self.ffn = PositionWiseFFN(ffn_num_hiddens, num_hiddens) self.addnorm2 = AddNorm(dropout)
[docs] def forward(self, X, valid_lens): Y = self.addnorm1(X, self.attention(X, X, X, valid_lens)) return self.addnorm2(Y, self.ffn(Y))
[docs]class TransformerEncoder(d2l.Encoder): """Transformer encoder. Defined in :numref:`sec_transformer`""" def __init__(self, vocab_size, num_hiddens, ffn_num_hiddens, num_heads, num_layers, dropout, use_bias=False, **kwargs): super(TransformerEncoder, self).__init__(**kwargs) self.num_hiddens = num_hiddens self.embedding = nn.Embedding(vocab_size, num_hiddens) self.pos_encoding = d2l.PositionalEncoding(num_hiddens, dropout) self.blks = nn.Sequential() for _ in range(num_layers): self.blks.add( EncoderBlock(num_hiddens, ffn_num_hiddens, num_heads, dropout, use_bias))
[docs] def forward(self, X, valid_lens, *args): # Since positional encoding values are between -1 and 1, the embedding # values are multiplied by the square root of the embedding dimension # to rescale before they are summed up X = self.pos_encoding(self.embedding(X) * math.sqrt(self.num_hiddens)) self.attention_weights = [None] * len(self.blks) for i, blk in enumerate(self.blks): X = blk(X, valid_lens) self.attention_weights[ i] = blk.attention.attention.attention_weights return X
[docs]def annotate(text, xy, xytext): d2l.plt.gca().annotate(text, xy=xy, xytext=xytext, arrowprops=dict(arrowstyle='->'))
[docs]def train_2d(trainer, steps=20, f_grad=None): """Optimize a 2D objective function with a customized trainer. Defined in :numref:`subsec_gd-learningrate`""" # `s1` and `s2` are internal state variables that will be used later x1, x2, s1, s2 = -5, -2, 0, 0 results = [(x1, x2)] for i in range(steps): if f_grad: x1, x2, s1, s2 = trainer(x1, x2, s1, s2, f_grad) else: x1, x2, s1, s2 = trainer(x1, x2, s1, s2) results.append((x1, x2)) print(f'epoch {i + 1}, x1: {float(x1):f}, x2: {float(x2):f}') return results
[docs]def show_trace_2d(f, results): """Show the trace of 2D variables during optimization. Defined in :numref:`subsec_gd-learningrate`""" d2l.set_figsize() d2l.plt.plot(*zip(*results), '-o', color='#ff7f0e') x1, x2 = d2l.meshgrid(d2l.arange(-5.5, 1.0, 0.1), d2l.arange(-3.0, 1.0, 0.1)) d2l.plt.contour(x1, x2, f(x1, x2), colors='#1f77b4') d2l.plt.xlabel('x1') d2l.plt.ylabel('x2')
d2l.DATA_HUB['airfoil'] = (d2l.DATA_URL + 'airfoil_self_noise.dat', '76e5be1548fd8222e5074cf0faae75edff8cf93f')
[docs]def get_data_ch11(batch_size=10, n=1500): """Defined in :numref:`sec_minibatches`""" data = np.genfromtxt('airfoil'), dtype=np.float32, delimiter='\t') data = (data - data.mean(axis=0)) / data.std(axis=0) data_iter = d2l.load_array( (data[:n, :-1], data[:n, -1]), batch_size, is_train=True) return data_iter, data.shape[1]-1
[docs]def train_ch11(trainer_fn, states, hyperparams, data_iter, feature_dim, num_epochs=2): """Defined in :numref:`sec_minibatches`""" # Initialization w = np.random.normal(scale=0.01, size=(feature_dim, 1)) b = np.zeros(1) w.attach_grad() b.attach_grad() net, loss = lambda X: d2l.linreg(X, w, b), d2l.squared_loss # Train animator = d2l.Animator(xlabel='epoch', ylabel='loss', xlim=[0, num_epochs], ylim=[0.22, 0.35]) n, timer = 0, d2l.Timer() for _ in range(num_epochs): for X, y in data_iter: with autograd.record(): l = loss(net(X), y).mean() l.backward() trainer_fn([w, b], states, hyperparams) n += X.shape[0] if n % 200 == 0: timer.stop() animator.add(n/X.shape[0]/len(data_iter), (d2l.evaluate_loss(net, data_iter, loss),)) timer.start() print(f'loss: {animator.Y[0][-1]:.3f}, {timer.avg():.3f} sec/epoch') return timer.cumsum(), animator.Y[0]
[docs]def train_concise_ch11(tr_name, hyperparams, data_iter, num_epochs=2): """Defined in :numref:`sec_minibatches`""" # Initialization net = nn.Sequential() net.add(nn.Dense(1)) net.initialize(init.Normal(sigma=0.01)) trainer = gluon.Trainer(net.collect_params(), tr_name, hyperparams) loss = gluon.loss.L2Loss() animator = d2l.Animator(xlabel='epoch', ylabel='loss', xlim=[0, num_epochs], ylim=[0.22, 0.35]) n, timer = 0, d2l.Timer() for _ in range(num_epochs): for X, y in data_iter: with autograd.record(): l = loss(net(X), y) l.backward() trainer.step(X.shape[0]) n += X.shape[0] if n % 200 == 0: timer.stop() animator.add(n/X.shape[0]/len(data_iter), (d2l.evaluate_loss(net, data_iter, loss),)) timer.start() print(f'loss: {animator.Y[0][-1]:.3f}, {timer.avg():.3f} sec/epoch')
[docs]class Benchmark: """For measuring running time.""" def __init__(self, description='Done'): """Defined in :numref:`sec_hybridize`""" self.description = description def __enter__(self): self.timer = d2l.Timer() return self def __exit__(self, *args): print(f'{self.description}: {self.timer.stop():.4f} sec')
[docs]def split_batch(X, y, devices): """Split `X` and `y` into multiple devices. Defined in :numref:`sec_multi_gpu`""" assert X.shape[0] == y.shape[0] return (gluon.utils.split_and_load(X, devices), gluon.utils.split_and_load(y, devices))
[docs]def resnet18(num_classes): """A slightly modified ResNet-18 model. Defined in :numref:`sec_multi_gpu_concise`""" def resnet_block(num_channels, num_residuals, first_block=False): blk = nn.Sequential() for i in range(num_residuals): if i == 0 and not first_block: blk.add(d2l.Residual( num_channels, use_1x1conv=True, strides=2)) else: blk.add(d2l.Residual(num_channels)) return blk net = nn.Sequential() # This model uses a smaller convolution kernel, stride, and padding and # removes the maximum pooling layer net.add(nn.Conv2D(64, kernel_size=3, strides=1, padding=1), nn.BatchNorm(), nn.Activation('relu')) net.add(resnet_block(64, 2, first_block=True), resnet_block(128, 2), resnet_block(256, 2), resnet_block(512, 2)) net.add(nn.GlobalAvgPool2D(), nn.Dense(num_classes)) return net
[docs]def evaluate_accuracy_gpus(net, data_iter, split_f=d2l.split_batch): """Compute the accuracy for a model on a dataset using multiple GPUs. Defined in :numref:`sec_multi_gpu_concise`""" # Query the list of devices devices = list(net.collect_params().values())[0].list_ctx() # No. of correct predictions, no. of predictions metric = d2l.Accumulator(2) for features, labels in data_iter: X_shards, y_shards = split_f(features, labels, devices) # Run in parallel pred_shards = [net(X_shard) for X_shard in X_shards] metric.add(sum(float(d2l.accuracy(pred_shard, y_shard)) for pred_shard, y_shard in zip( pred_shards, y_shards)), labels.size) return metric[0] / metric[1]
[docs]def train_batch_ch13(net, features, labels, loss, trainer, devices, split_f=d2l.split_batch): """Train for a minibatch with mutiple GPUs (defined in Chapter 13). Defined in :numref:`sec_image_augmentation`""" X_shards, y_shards = split_f(features, labels, devices) with autograd.record(): pred_shards = [net(X_shard) for X_shard in X_shards] ls = [loss(pred_shard, y_shard) for pred_shard, y_shard in zip(pred_shards, y_shards)] for l in ls: l.backward() # The `True` flag allows parameters with stale gradients, which is useful # later (e.g., in fine-tuning BERT) trainer.step(labels.shape[0], ignore_stale_grad=True) train_loss_sum = sum([float(l.sum()) for l in ls]) train_acc_sum = sum(d2l.accuracy(pred_shard, y_shard) for pred_shard, y_shard in zip(pred_shards, y_shards)) return train_loss_sum, train_acc_sum
[docs]def train_ch13(net, train_iter, test_iter, loss, trainer, num_epochs, devices=d2l.try_all_gpus(), split_f=d2l.split_batch): """Train a model with mutiple GPUs (defined in Chapter 13). Defined in :numref:`sec_image_augmentation`""" timer, num_batches = d2l.Timer(), len(train_iter) animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0, 1], legend=['train loss', 'train acc', 'test acc']) for epoch in range(num_epochs): # Sum of training loss, sum of training accuracy, no. of examples, # no. of predictions metric = d2l.Accumulator(4) for i, (features, labels) in enumerate(train_iter): timer.start() l, acc = train_batch_ch13( net, features, labels, loss, trainer, devices, split_f) metric.add(l, acc, labels.shape[0], labels.size) timer.stop() if (i + 1) % (num_batches // 5) == 0 or i == num_batches - 1: animator.add(epoch + (i + 1) / num_batches, (metric[0] / metric[2], metric[1] / metric[3], None)) test_acc = d2l.evaluate_accuracy_gpus(net, test_iter, split_f) animator.add(epoch + 1, (None, None, test_acc)) print(f'loss {metric[0] / metric[2]:.3f}, train acc ' f'{metric[1] / metric[3]:.3f}, test acc {test_acc:.3f}') print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec on ' f'{str(devices)}')
d2l.DATA_HUB['hotdog'] = (d2l.DATA_URL + '', 'fba480ffa8aa7e0febbb511d181409f899b9baa5')
[docs]def box_corner_to_center(boxes): """Convert from (upper-left, lower-right) to (center, width, height). Defined in :numref:`sec_bbox`""" x1, y1, x2, y2 = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3] cx = (x1 + x2) / 2 cy = (y1 + y2) / 2 w = x2 - x1 h = y2 - y1 boxes = d2l.stack((cx, cy, w, h), axis=-1) return boxes
[docs]def box_center_to_corner(boxes): """Convert from (center, width, height) to (upper-left, lower-right). Defined in :numref:`sec_bbox`""" cx, cy, w, h = boxes[:, 0], boxes[:, 1], boxes[:, 2], boxes[:, 3] x1 = cx - 0.5 * w y1 = cy - 0.5 * h x2 = cx + 0.5 * w y2 = cy + 0.5 * h boxes = d2l.stack((x1, y1, x2, y2), axis=-1) return boxes
[docs]def bbox_to_rect(bbox, color): """Convert bounding box to matplotlib format. Defined in :numref:`sec_bbox`""" # Convert the bounding box (upper-left x, upper-left y, lower-right x, # lower-right y) format to the matplotlib format: ((upper-left x, # upper-left y), width, height) return d2l.plt.Rectangle( xy=(bbox[0], bbox[1]), width=bbox[2]-bbox[0], height=bbox[3]-bbox[1], fill=False, edgecolor=color, linewidth=2)
[docs]def multibox_prior(data, sizes, ratios): """Generate anchor boxes with different shapes centered on each pixel. Defined in :numref:`sec_anchor`""" in_height, in_width = data.shape[-2:] device, num_sizes, num_ratios = data.ctx, len(sizes), len(ratios) boxes_per_pixel = (num_sizes + num_ratios - 1) size_tensor = d2l.tensor(sizes, ctx=device) ratio_tensor = d2l.tensor(ratios, ctx=device) # Offsets are required to move the anchor to the center of a pixel. Since # a pixel has height=1 and width=1, we choose to offset our centers by 0.5 offset_h, offset_w = 0.5, 0.5 steps_h = 1.0 / in_height # Scaled steps in y-axis steps_w = 1.0 / in_width # Scaled steps in x-axis # Generate all center points for the anchor boxes center_h = (d2l.arange(in_height, ctx=device) + offset_h) * steps_h center_w = (d2l.arange(in_width, ctx=device) + offset_w) * steps_w shift_x, shift_y = d2l.meshgrid(center_w, center_h) shift_x, shift_y = shift_x.reshape(-1), shift_y.reshape(-1) # Generate `boxes_per_pixel` number of heights and widths that are later # used to create anchor box corner coordinates (xmin, xmax, ymin, ymax) w = np.concatenate((size_tensor * np.sqrt(ratio_tensor[0]), sizes[0] * np.sqrt(ratio_tensor[1:]))) \ * in_height / in_width # Handle rectangular inputs h = np.concatenate((size_tensor / np.sqrt(ratio_tensor[0]), sizes[0] / np.sqrt(ratio_tensor[1:]))) # Divide by 2 to get half height and half width anchor_manipulations = np.tile(np.stack((-w, -h, w, h)).T, (in_height * in_width, 1)) / 2 # Each center point will have `boxes_per_pixel` number of anchor boxes, so # generate a grid of all anchor box centers with `boxes_per_pixel` repeats out_grid = d2l.stack([shift_x, shift_y, shift_x, shift_y], axis=1).repeat(boxes_per_pixel, axis=0) output = out_grid + anchor_manipulations return np.expand_dims(output, axis=0)
[docs]def show_bboxes(axes, bboxes, labels=None, colors=None): """Show bounding boxes. Defined in :numref:`sec_anchor`""" def make_list(obj, default_values=None): if obj is None: obj = default_values elif not isinstance(obj, (list, tuple)): obj = [obj] return obj labels = make_list(labels) colors = make_list(colors, ['b', 'g', 'r', 'm', 'c']) for i, bbox in enumerate(bboxes): color = colors[i % len(colors)] rect = d2l.bbox_to_rect(d2l.numpy(bbox), color) axes.add_patch(rect) if labels and len(labels) > i: text_color = 'k' if color == 'w' else 'w' axes.text(rect.xy[0], rect.xy[1], labels[i], va='center', ha='center', fontsize=9, color=text_color, bbox=dict(facecolor=color, lw=0))
[docs]def box_iou(boxes1, boxes2): """Compute pairwise IoU across two lists of anchor or bounding boxes. Defined in :numref:`sec_anchor`""" box_area = lambda boxes: ((boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])) # Shape of `boxes1`, `boxes2`, `areas1`, `areas2`: (no. of boxes1, 4), # (no. of boxes2, 4), (no. of boxes1,), (no. of boxes2,) areas1 = box_area(boxes1) areas2 = box_area(boxes2) # Shape of `inter_upperlefts`, `inter_lowerrights`, `inters`: (no. of # boxes1, no. of boxes2, 2) inter_upperlefts = np.maximum(boxes1[:, None, :2], boxes2[:, :2]) inter_lowerrights = np.minimum(boxes1[:, None, 2:], boxes2[:, 2:]) inters = (inter_lowerrights - inter_upperlefts).clip(min=0) # Shape of `inter_areas` and `union_areas`: (no. of boxes1, no. of boxes2) inter_areas = inters[:, :, 0] * inters[:, :, 1] union_areas = areas1[:, None] + areas2 - inter_areas return inter_areas / union_areas
[docs]def assign_anchor_to_bbox(ground_truth, anchors, device, iou_threshold=0.5): """Assign closest ground-truth bounding boxes to anchor boxes. Defined in :numref:`sec_anchor`""" num_anchors, num_gt_boxes = anchors.shape[0], ground_truth.shape[0] # Element x_ij in the i-th row and j-th column is the IoU of the anchor # box i and the ground-truth bounding box j jaccard = box_iou(anchors, ground_truth) # Initialize the tensor to hold the assigned ground-truth bounding box for # each anchor anchors_bbox_map = np.full((num_anchors,), -1, dtype=np.int32, ctx=device) # Assign ground-truth bounding boxes according to the threshold max_ious, indices = np.max(jaccard, axis=1), np.argmax(jaccard, axis=1) anc_i = np.nonzero(max_ious >= 0.5)[0] box_j = indices[max_ious >= 0.5] anchors_bbox_map[anc_i] = box_j col_discard = np.full((num_anchors,), -1) row_discard = np.full((num_gt_boxes,), -1) for _ in range(num_gt_boxes): max_idx = np.argmax(jaccard) # Find the largest IoU box_idx = (max_idx % num_gt_boxes).astype('int32') anc_idx = (max_idx / num_gt_boxes).astype('int32') anchors_bbox_map[anc_idx] = box_idx jaccard[:, box_idx] = col_discard jaccard[anc_idx, :] = row_discard return anchors_bbox_map
[docs]def offset_boxes(anchors, assigned_bb, eps=1e-6): """Transform for anchor box offsets. Defined in :numref:`subsec_labeling-anchor-boxes`""" c_anc = d2l.box_corner_to_center(anchors) c_assigned_bb = d2l.box_corner_to_center(assigned_bb) offset_xy = 10 * (c_assigned_bb[:, :2] - c_anc[:, :2]) / c_anc[:, 2:] offset_wh = 5 * d2l.log(eps + c_assigned_bb[:, 2:] / c_anc[:, 2:]) offset = d2l.concat([offset_xy, offset_wh], axis=1) return offset
[docs]def multibox_target(anchors, labels): """Label anchor boxes using ground-truth bounding boxes. Defined in :numref:`subsec_labeling-anchor-boxes`""" batch_size, anchors = labels.shape[0], anchors.squeeze(0) batch_offset, batch_mask, batch_class_labels = [], [], [] device, num_anchors = anchors.ctx, anchors.shape[0] for i in range(batch_size): label = labels[i, :, :] anchors_bbox_map = assign_anchor_to_bbox( label[:, 1:], anchors, device) bbox_mask = np.tile((np.expand_dims((anchors_bbox_map >= 0), axis=-1)), (1, 4)).astype('int32') # Initialize class labels and assigned bounding box coordinates with # zeros class_labels = d2l.zeros(num_anchors, dtype=np.int32, ctx=device) assigned_bb = d2l.zeros((num_anchors, 4), dtype=np.float32, ctx=device) # Label classes of anchor boxes using their assigned ground-truth # bounding boxes. If an anchor box is not assigned any, we label its # class as background (the value remains zero) indices_true = np.nonzero(anchors_bbox_map >= 0)[0] bb_idx = anchors_bbox_map[indices_true] class_labels[indices_true] = label[bb_idx, 0].astype('int32') + 1 assigned_bb[indices_true] = label[bb_idx, 1:] # Offset transformation offset = offset_boxes(anchors, assigned_bb) * bbox_mask batch_offset.append(offset.reshape(-1)) batch_mask.append(bbox_mask.reshape(-1)) batch_class_labels.append(class_labels) bbox_offset = d2l.stack(batch_offset) bbox_mask = d2l.stack(batch_mask) class_labels = d2l.stack(batch_class_labels) return (bbox_offset, bbox_mask, class_labels)
[docs]def offset_inverse(anchors, offset_preds): """Predict bounding boxes based on anchor boxes with predicted offsets. Defined in :numref:`subsec_labeling-anchor-boxes`""" anc = d2l.box_corner_to_center(anchors) pred_bbox_xy = (offset_preds[:, :2] * anc[:, 2:] / 10) + anc[:, :2] pred_bbox_wh = d2l.exp(offset_preds[:, 2:] / 5) * anc[:, 2:] pred_bbox = d2l.concat((pred_bbox_xy, pred_bbox_wh), axis=1) predicted_bbox = d2l.box_center_to_corner(pred_bbox) return predicted_bbox
[docs]def nms(boxes, scores, iou_threshold): """Sort confidence scores of predicted bounding boxes. Defined in :numref:`subsec_predicting-bounding-boxes-nms`""" B = scores.argsort()[::-1] keep = [] # Indices of predicted bounding boxes that will be kept while B.size > 0: i = B[0] keep.append(i) if B.size == 1: break iou = box_iou(boxes[i, :].reshape(-1, 4), boxes[B[1:], :].reshape(-1, 4)).reshape(-1) inds = np.nonzero(iou <= iou_threshold)[0] B = B[inds + 1] return np.array(keep, dtype=np.int32, ctx=boxes.ctx)
[docs]def multibox_detection(cls_probs, offset_preds, anchors, nms_threshold=0.5, pos_threshold=0.009999999): """Predict bounding boxes using non-maximum suppression. Defined in :numref:`subsec_predicting-bounding-boxes-nms`""" device, batch_size = cls_probs.ctx, cls_probs.shape[0] anchors = np.squeeze(anchors, axis=0) num_classes, num_anchors = cls_probs.shape[1], cls_probs.shape[2] out = [] for i in range(batch_size): cls_prob, offset_pred = cls_probs[i], offset_preds[i].reshape(-1, 4) conf, class_id = np.max(cls_prob[1:], 0), np.argmax(cls_prob[1:], 0) predicted_bb = offset_inverse(anchors, offset_pred) keep = nms(predicted_bb, conf, nms_threshold) # Find all non-`keep` indices and set the class to background all_idx = np.arange(num_anchors, dtype=np.int32, ctx=device) combined = d2l.concat((keep, all_idx)) unique, counts = np.unique(combined, return_counts=True) non_keep = unique[counts == 1] all_id_sorted = d2l.concat((keep, non_keep)) class_id[non_keep] = -1 class_id = class_id[all_id_sorted].astype('float32') conf, predicted_bb = conf[all_id_sorted], predicted_bb[all_id_sorted] # Here `pos_threshold` is a threshold for positive (non-background) # predictions below_min_idx = (conf < pos_threshold) class_id[below_min_idx] = -1 conf[below_min_idx] = 1 - conf[below_min_idx] pred_info = d2l.concat((np.expand_dims(class_id, axis=1), np.expand_dims(conf, axis=1), predicted_bb), axis=1) out.append(pred_info) return d2l.stack(out)
d2l.DATA_HUB['banana-detection'] = ( d2l.DATA_URL + '', '5de26c8fce5ccdea9f91267273464dc968d20d72')
[docs]def read_data_bananas(is_train=True): """Read the banana detection dataset images and labels. Defined in :numref:`sec_object-detection-dataset`""" data_dir = d2l.download_extract('banana-detection') csv_fname = os.path.join(data_dir, 'bananas_train' if is_train else 'bananas_val', 'label.csv') csv_data = pd.read_csv(csv_fname) csv_data = csv_data.set_index('img_name') images, targets = [], [] for img_name, target in csv_data.iterrows(): images.append(image.imread( os.path.join(data_dir, 'bananas_train' if is_train else 'bananas_val', 'images', f'{img_name}'))) # Here `target` contains (class, upper-left x, upper-left y, # lower-right x, lower-right y), where all the images have the same # banana class (index 0) targets.append(list(target)) return images, np.expand_dims(np.array(targets), 1) / 256
[docs]class BananasDataset( """A customized dataset to load the banana detection dataset. Defined in :numref:`sec_object-detection-dataset`""" def __init__(self, is_train): self.features, self.labels = read_data_bananas(is_train) print('read ' + str(len(self.features)) + (f' training examples' if is_train else f' validation examples')) def __getitem__(self, idx): return (self.features[idx].astype('float32').transpose(2, 0, 1), self.labels[idx]) def __len__(self): return len(self.features)
[docs]def load_data_bananas(batch_size): """Load the banana detection dataset. Defined in :numref:`sec_object-detection-dataset`""" train_iter =, batch_size, shuffle=True) val_iter =, batch_size) return train_iter, val_iter
d2l.DATA_HUB['voc2012'] = (d2l.DATA_URL + 'VOCtrainval_11-May-2012.tar', '4e443f8a2eca6b1dac8a6c57641b67dd40621a49')
[docs]def read_voc_images(voc_dir, is_train=True): """Read all VOC feature and label images. Defined in :numref:`sec_semantic_segmentation`""" txt_fname = os.path.join(voc_dir, 'ImageSets', 'Segmentation', 'train.txt' if is_train else 'val.txt') with open(txt_fname, 'r') as f: images = features, labels = [], [] for i, fname in enumerate(images): features.append(image.imread(os.path.join( voc_dir, 'JPEGImages', f'{fname}.jpg'))) labels.append(image.imread(os.path.join( voc_dir, 'SegmentationClass', f'{fname}.png'))) return features, labels
VOC_COLORMAP = [[0, 0, 0], [128, 0, 0], [0, 128, 0], [128, 128, 0], [0, 0, 128], [128, 0, 128], [0, 128, 128], [128, 128, 128], [64, 0, 0], [192, 0, 0], [64, 128, 0], [192, 128, 0], [64, 0, 128], [192, 0, 128], [64, 128, 128], [192, 128, 128], [0, 64, 0], [128, 64, 0], [0, 192, 0], [128, 192, 0], [0, 64, 128]] VOC_CLASSES = ['background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'potted plant', 'sheep', 'sofa', 'train', 'tv/monitor']
[docs]def voc_colormap2label(): """Build the mapping from RGB to class indices for VOC labels. Defined in :numref:`sec_semantic_segmentation`""" colormap2label = np.zeros(256 ** 3) for i, colormap in enumerate(VOC_COLORMAP): colormap2label[ (colormap[0] * 256 + colormap[1]) * 256 + colormap[2]] = i return colormap2label
[docs]def voc_label_indices(colormap, colormap2label): """Map any RGB values in VOC labels to their class indices. Defined in :numref:`sec_semantic_segmentation`""" colormap = colormap.astype(np.int32) idx = ((colormap[:, :, 0] * 256 + colormap[:, :, 1]) * 256 + colormap[:, :, 2]) return colormap2label[idx]
[docs]def voc_rand_crop(feature, label, height, width): """Randomly crop both feature and label images. Defined in :numref:`sec_semantic_segmentation`""" feature, rect = image.random_crop(feature, (width, height)) label = image.fixed_crop(label, *rect) return feature, label
[docs]class VOCSegDataset( """A customized dataset to load the VOC dataset. Defined in :numref:`sec_semantic_segmentation`""" def __init__(self, is_train, crop_size, voc_dir): self.rgb_mean = np.array([0.485, 0.456, 0.406]) self.rgb_std = np.array([0.229, 0.224, 0.225]) self.crop_size = crop_size features, labels = read_voc_images(voc_dir, is_train=is_train) self.features = [self.normalize_image(feature) for feature in self.filter(features)] self.labels = self.filter(labels) self.colormap2label = voc_colormap2label() print('read ' + str(len(self.features)) + ' examples')
[docs] def normalize_image(self, img): return (img.astype('float32') / 255 - self.rgb_mean) / self.rgb_std
[docs] def filter(self, imgs): return [img for img in imgs if ( img.shape[0] >= self.crop_size[0] and img.shape[1] >= self.crop_size[1])]
def __getitem__(self, idx): feature, label = voc_rand_crop(self.features[idx], self.labels[idx], *self.crop_size) return (feature.transpose(2, 0, 1), voc_label_indices(label, self.colormap2label)) def __len__(self): return len(self.features)
[docs]def load_data_voc(batch_size, crop_size): """Load the VOC semantic segmentation dataset. Defined in :numref:`sec_semantic_segmentation`""" voc_dir = d2l.download_extract('voc2012', os.path.join( 'VOCdevkit', 'VOC2012')) num_workers = d2l.get_dataloader_workers() train_iter = VOCSegDataset(True, crop_size, voc_dir), batch_size, shuffle=True, last_batch='discard', num_workers=num_workers) test_iter = VOCSegDataset(False, crop_size, voc_dir), batch_size, last_batch='discard', num_workers=num_workers) return train_iter, test_iter
d2l.DATA_HUB['cifar10_tiny'] = (d2l.DATA_URL + '', '2068874e4b9a9f0fb07ebe0ad2b29754449ccacd')
[docs]def read_csv_labels(fname): """Read `fname` to return a filename to label dictionary. Defined in :numref:`sec_kaggle_cifar10`""" with open(fname, 'r') as f: # Skip the file header line (column name) lines = f.readlines()[1:] tokens = [l.rstrip().split(',') for l in lines] return dict(((name, label) for name, label in tokens))
[docs]def copyfile(filename, target_dir): """Copy a file into a target directory. Defined in :numref:`sec_kaggle_cifar10`""" os.makedirs(target_dir, exist_ok=True) shutil.copy(filename, target_dir)
[docs]def reorg_train_valid(data_dir, labels, valid_ratio): """Split the validation set out of the original training set. Defined in :numref:`sec_kaggle_cifar10`""" # The number of examples of the class that has the fewest examples in the # training dataset n = collections.Counter(labels.values()).most_common()[-1][1] # The number of examples per class for the validation set n_valid_per_label = max(1, math.floor(n * valid_ratio)) label_count = {} for train_file in os.listdir(os.path.join(data_dir, 'train')): label = labels[train_file.split('.')[0]] fname = os.path.join(data_dir, 'train', train_file) copyfile(fname, os.path.join(data_dir, 'train_valid_test', 'train_valid', label)) if label not in label_count or label_count[label] < n_valid_per_label: copyfile(fname, os.path.join(data_dir, 'train_valid_test', 'valid', label)) label_count[label] = label_count.get(label, 0) + 1 else: copyfile(fname, os.path.join(data_dir, 'train_valid_test', 'train', label)) return n_valid_per_label
[docs]def reorg_test(data_dir): """Organize the testing set for data loading during prediction. Defined in :numref:`sec_kaggle_cifar10`""" for test_file in os.listdir(os.path.join(data_dir, 'test')): copyfile(os.path.join(data_dir, 'test', test_file), os.path.join(data_dir, 'train_valid_test', 'test', 'unknown'))
d2l.DATA_HUB['dog_tiny'] = (d2l.DATA_URL + '', '0cb91d09b814ecdc07b50f31f8dcad3e81d6a86d') d2l.DATA_HUB['ptb'] = (d2l.DATA_URL + '', '319d85e578af0cdc590547f26231e4e31cdf1e42')
[docs]def read_ptb(): """Load the PTB dataset into a list of text lines. Defined in :numref:`sec_word2vec_data`""" data_dir = d2l.download_extract('ptb') # Read the training set. with open(os.path.join(data_dir, 'ptb.train.txt')) as f: raw_text = return [line.split() for line in raw_text.split('\n')]
[docs]def subsample(sentences, vocab): """Subsample high-frequency words. Defined in :numref:`sec_word2vec_data`""" # Exclude unknown tokens '<unk>' sentences = [[token for token in line if vocab[token] != vocab.unk] for line in sentences] counter = d2l.count_corpus(sentences) num_tokens = sum(counter.values()) # Return True if `token` is kept during subsampling def keep(token): return(random.uniform(0, 1) < math.sqrt(1e-4 / counter[token] * num_tokens)) return ([[token for token in line if keep(token)] for line in sentences], counter)
[docs]def get_centers_and_contexts(corpus, max_window_size): """Return center words and context words in skip-gram. Defined in :numref:`sec_word2vec_data`""" centers, contexts = [], [] for line in corpus: # To form a "center word--context word" pair, each sentence needs to # have at least 2 words if len(line) < 2: continue centers += line for i in range(len(line)): # Context window centered at `i` window_size = random.randint(1, max_window_size) indices = list(range(max(0, i - window_size), min(len(line), i + 1 + window_size))) # Exclude the center word from the context words indices.remove(i) contexts.append([line[idx] for idx in indices]) return centers, contexts
[docs]class RandomGenerator: """Randomly draw among {1, ..., n} according to n sampling weights.""" def __init__(self, sampling_weights): """Defined in :numref:`sec_word2vec_data`""" # Exclude self.population = list(range(1, len(sampling_weights) + 1)) self.sampling_weights = sampling_weights self.candidates = [] self.i = 0
[docs] def draw(self): if self.i == len(self.candidates): # Cache `k` random sampling results self.candidates = random.choices( self.population, self.sampling_weights, k=10000) self.i = 0 self.i += 1 return self.candidates[self.i - 1]
[docs]def get_negatives(all_contexts, vocab, counter, K): """Return noise words in negative sampling. Defined in :numref:`sec_word2vec_data`""" # Sampling weights for words with indices 1, 2, ... (index 0 is the # excluded unknown token) in the vocabulary sampling_weights = [counter[vocab.to_tokens(i)]**0.75 for i in range(1, len(vocab))] all_negatives, generator = [], RandomGenerator(sampling_weights) for contexts in all_contexts: negatives = [] while len(negatives) < len(contexts) * K: neg = generator.draw() # Noise words cannot be context words if neg not in contexts: negatives.append(neg) all_negatives.append(negatives) return all_negatives
[docs]def batchify(data): """Return a minibatch of examples for skip-gram with negative sampling. Defined in :numref:`sec_word2vec_data`""" max_len = max(len(c) + len(n) for _, c, n in data) centers, contexts_negatives, masks, labels = [], [], [], [] for center, context, negative in data: cur_len = len(context) + len(negative) centers += [center] contexts_negatives += [context + negative + [0] * (max_len - cur_len)] masks += [[1] * cur_len + [0] * (max_len - cur_len)] labels += [[1] * len(context) + [0] * (max_len - len(context))] return (d2l.reshape(d2l.tensor(centers), (-1, 1)), d2l.tensor( contexts_negatives), d2l.tensor(masks), d2l.tensor(labels))
[docs]def load_data_ptb(batch_size, max_window_size, num_noise_words): """Download the PTB dataset and then load it into memory. Defined in :numref:`subsec_word2vec-minibatch-loading`""" sentences = read_ptb() vocab = d2l.Vocab(sentences, min_freq=10) subsampled, counter = subsample(sentences, vocab) corpus = [vocab[line] for line in subsampled] all_centers, all_contexts = get_centers_and_contexts( corpus, max_window_size) all_negatives = get_negatives( all_contexts, vocab, counter, num_noise_words) dataset = all_centers, all_contexts, all_negatives) data_iter = dataset, batch_size, shuffle=True,batchify_fn=batchify, num_workers=d2l.get_dataloader_workers()) return data_iter, vocab
d2l.DATA_HUB['glove.6b.50d'] = (d2l.DATA_URL + '', '0b8703943ccdb6eb788e6f091b8946e82231bc4d') d2l.DATA_HUB['glove.6b.100d'] = (d2l.DATA_URL + '', 'cd43bfb07e44e6f27cbcc7bc9ae3d80284fdaf5a') d2l.DATA_HUB['glove.42b.300d'] = (d2l.DATA_URL + '', 'b5116e234e9eb9076672cfeabf5469f3eec904fa') d2l.DATA_HUB['wiki.en'] = (d2l.DATA_URL + '', 'c1816da3821ae9f43899be655002f6c723e91b88')
[docs]class TokenEmbedding: """Token Embedding.""" def __init__(self, embedding_name): """Defined in :numref:`sec_synonyms`""" self.idx_to_token, self.idx_to_vec = self._load_embedding( embedding_name) self.unknown_idx = 0 self.token_to_idx = {token: idx for idx, token in enumerate(self.idx_to_token)} def _load_embedding(self, embedding_name): idx_to_token, idx_to_vec = ['<unk>'], [] data_dir = d2l.download_extract(embedding_name) # GloVe website: # fastText website: with open(os.path.join(data_dir, 'vec.txt'), 'r') as f: for line in f: elems = line.rstrip().split(' ') token, elems = elems[0], [float(elem) for elem in elems[1:]] # Skip header information, such as the top row in fastText if len(elems) > 1: idx_to_token.append(token) idx_to_vec.append(elems) idx_to_vec = [[0] * len(idx_to_vec[0])] + idx_to_vec return idx_to_token, d2l.tensor(idx_to_vec) def __getitem__(self, tokens): indices = [self.token_to_idx.get(token, self.unknown_idx) for token in tokens] vecs = self.idx_to_vec[d2l.tensor(indices)] return vecs def __len__(self): return len(self.idx_to_token)
[docs]def get_tokens_and_segments(tokens_a, tokens_b=None): """Get tokens of the BERT input sequence and their segment IDs. Defined in :numref:`sec_bert`""" tokens = ['<cls>'] + tokens_a + ['<sep>'] # 0 and 1 are marking segment A and B, respectively segments = [0] * (len(tokens_a) + 2) if tokens_b is not None: tokens += tokens_b + ['<sep>'] segments += [1] * (len(tokens_b) + 1) return tokens, segments
[docs]class BERTEncoder(nn.Block): """BERT encoder. Defined in :numref:`subsec_bert_input_rep`""" def __init__(self, vocab_size, num_hiddens, ffn_num_hiddens, num_heads, num_layers, dropout, max_len=1000, **kwargs): super(BERTEncoder, self).__init__(**kwargs) self.token_embedding = nn.Embedding(vocab_size, num_hiddens) self.segment_embedding = nn.Embedding(2, num_hiddens) self.blks = nn.Sequential() for _ in range(num_layers): self.blks.add(d2l.EncoderBlock( num_hiddens, ffn_num_hiddens, num_heads, dropout, True)) # In BERT, positional embeddings are learnable, thus we create a # parameter of positional embeddings that are long enough self.pos_embedding = self.params.get('pos_embedding', shape=(1, max_len, num_hiddens))
[docs] def forward(self, tokens, segments, valid_lens): # Shape of `X` remains unchanged in the following code snippet: # (batch size, max sequence length, `num_hiddens`) X = self.token_embedding(tokens) + self.segment_embedding(segments) X = X +[:, :X.shape[1], :] for blk in self.blks: X = blk(X, valid_lens) return X
[docs]class MaskLM(nn.Block): """The masked language model task of BERT. Defined in :numref:`subsec_bert_input_rep`""" def __init__(self, vocab_size, num_hiddens, **kwargs): super(MaskLM, self).__init__(**kwargs) self.mlp = nn.Sequential() self.mlp.add( nn.Dense(num_hiddens, flatten=False, activation='relu')) self.mlp.add(nn.LayerNorm()) self.mlp.add(nn.Dense(vocab_size, flatten=False))
[docs] def forward(self, X, pred_positions): num_pred_positions = pred_positions.shape[1] pred_positions = pred_positions.reshape(-1) batch_size = X.shape[0] batch_idx = np.arange(0, batch_size) # Suppose that `batch_size` = 2, `num_pred_positions` = 3, then # `batch_idx` is `np.array([0, 0, 0, 1, 1, 1])` batch_idx = np.repeat(batch_idx, num_pred_positions) masked_X = X[batch_idx, pred_positions] masked_X = masked_X.reshape((batch_size, num_pred_positions, -1)) mlm_Y_hat = self.mlp(masked_X) return mlm_Y_hat
[docs]class NextSentencePred(nn.Block): """The next sentence prediction task of BERT. Defined in :numref:`subsec_mlm`""" def __init__(self, **kwargs): super(NextSentencePred, self).__init__(**kwargs) self.output = nn.Dense(2)
[docs] def forward(self, X): # `X` shape: (batch size, `num_hiddens`) return self.output(X)
[docs]class BERTModel(nn.Block): """The BERT model. Defined in :numref:`subsec_nsp`""" def __init__(self, vocab_size, num_hiddens, ffn_num_hiddens, num_heads, num_layers, dropout, max_len=1000): super(BERTModel, self).__init__() self.encoder = BERTEncoder(vocab_size, num_hiddens, ffn_num_hiddens, num_heads, num_layers, dropout, max_len) self.hidden = nn.Dense(num_hiddens, activation='tanh') self.mlm = MaskLM(vocab_size, num_hiddens) self.nsp = NextSentencePred()
[docs] def forward(self, tokens, segments, valid_lens=None, pred_positions=None): encoded_X = self.encoder(tokens, segments, valid_lens) if pred_positions is not None: mlm_Y_hat = self.mlm(encoded_X, pred_positions) else: mlm_Y_hat = None # The hidden layer of the MLP classifier for next sentence prediction. # 0 is the index of the '<cls>' token nsp_Y_hat = self.nsp(self.hidden(encoded_X[:, 0, :])) return encoded_X, mlm_Y_hat, nsp_Y_hat
d2l.DATA_HUB['wikitext-2'] = ( '' '', '3c914d17d80b1459be871a5039ac23e752a53cbe') def _read_wiki(data_dir): """Defined in :numref:`sec_bert-dataset`""" file_name = os.path.join(data_dir, 'wiki.train.tokens') with open(file_name, 'r') as f: lines = f.readlines() # Uppercase letters are converted to lowercase ones paragraphs = [line.strip().lower().split(' . ') for line in lines if len(line.split(' . ')) >= 2] random.shuffle(paragraphs) return paragraphs def _get_next_sentence(sentence, next_sentence, paragraphs): """Defined in :numref:`sec_bert-dataset`""" if random.random() < 0.5: is_next = True else: # `paragraphs` is a list of lists of lists next_sentence = random.choice(random.choice(paragraphs)) is_next = False return sentence, next_sentence, is_next def _get_nsp_data_from_paragraph(paragraph, paragraphs, vocab, max_len): """Defined in :numref:`sec_bert-dataset`""" nsp_data_from_paragraph = [] for i in range(len(paragraph) - 1): tokens_a, tokens_b, is_next = _get_next_sentence( paragraph[i], paragraph[i + 1], paragraphs) # Consider 1 '<cls>' token and 2 '<sep>' tokens if len(tokens_a) + len(tokens_b) + 3 > max_len: continue tokens, segments = d2l.get_tokens_and_segments(tokens_a, tokens_b) nsp_data_from_paragraph.append((tokens, segments, is_next)) return nsp_data_from_paragraph def _replace_mlm_tokens(tokens, candidate_pred_positions, num_mlm_preds, vocab): """Defined in :numref:`sec_bert-dataset`""" # Make a new copy of tokens for the input of a masked language model, # where the input may contain replaced '<mask>' or random tokens mlm_input_tokens = [token for token in tokens] pred_positions_and_labels = [] # Shuffle for getting 15% random tokens for prediction in the masked # language modeling task random.shuffle(candidate_pred_positions) for mlm_pred_position in candidate_pred_positions: if len(pred_positions_and_labels) >= num_mlm_preds: break masked_token = None # 80% of the time: replace the word with the '<mask>' token if random.random() < 0.8: masked_token = '<mask>' else: # 10% of the time: keep the word unchanged if random.random() < 0.5: masked_token = tokens[mlm_pred_position] # 10% of the time: replace the word with a random word else: masked_token = random.choice(vocab.idx_to_token) mlm_input_tokens[mlm_pred_position] = masked_token pred_positions_and_labels.append( (mlm_pred_position, tokens[mlm_pred_position])) return mlm_input_tokens, pred_positions_and_labels def _get_mlm_data_from_tokens(tokens, vocab): """Defined in :numref:`subsec_prepare_mlm_data`""" candidate_pred_positions = [] # `tokens` is a list of strings for i, token in enumerate(tokens): # Special tokens are not predicted in the masked language modeling # task if token in ['<cls>', '<sep>']: continue candidate_pred_positions.append(i) # 15% of random tokens are predicted in the masked language modeling task num_mlm_preds = max(1, round(len(tokens) * 0.15)) mlm_input_tokens, pred_positions_and_labels = _replace_mlm_tokens( tokens, candidate_pred_positions, num_mlm_preds, vocab) pred_positions_and_labels = sorted(pred_positions_and_labels, key=lambda x: x[0]) pred_positions = [v[0] for v in pred_positions_and_labels] mlm_pred_labels = [v[1] for v in pred_positions_and_labels] return vocab[mlm_input_tokens], pred_positions, vocab[mlm_pred_labels] def _pad_bert_inputs(examples, max_len, vocab): """Defined in :numref:`subsec_prepare_mlm_data`""" max_num_mlm_preds = round(max_len * 0.15) all_token_ids, all_segments, valid_lens, = [], [], [] all_pred_positions, all_mlm_weights, all_mlm_labels = [], [], [] nsp_labels = [] for (token_ids, pred_positions, mlm_pred_label_ids, segments, is_next) in examples: all_token_ids.append(np.array(token_ids + [vocab['<pad>']] * ( max_len - len(token_ids)), dtype='int32')) all_segments.append(np.array(segments + [0] * ( max_len - len(segments)), dtype='int32')) # `valid_lens` excludes count of '<pad>' tokens valid_lens.append(np.array(len(token_ids), dtype='float32')) all_pred_positions.append(np.array(pred_positions + [0] * ( max_num_mlm_preds - len(pred_positions)), dtype='int32')) # Predictions of padded tokens will be filtered out in the loss via # multiplication of 0 weights all_mlm_weights.append( np.array([1.0] * len(mlm_pred_label_ids) + [0.0] * ( max_num_mlm_preds - len(pred_positions)), dtype='float32')) all_mlm_labels.append(np.array(mlm_pred_label_ids + [0] * ( max_num_mlm_preds - len(mlm_pred_label_ids)), dtype='int32')) nsp_labels.append(np.array(is_next)) return (all_token_ids, all_segments, valid_lens, all_pred_positions, all_mlm_weights, all_mlm_labels, nsp_labels) class _WikiTextDataset( """Defined in :numref:`subsec_prepare_mlm_data`""" def __init__(self, paragraphs, max_len): # Input `paragraphs[i]` is a list of sentence strings representing a # paragraph; while output `paragraphs[i]` is a list of sentences # representing a paragraph, where each sentence is a list of tokens paragraphs = [d2l.tokenize( paragraph, token='word') for paragraph in paragraphs] sentences = [sentence for paragraph in paragraphs for sentence in paragraph] self.vocab = d2l.Vocab(sentences, min_freq=5, reserved_tokens=[ '<pad>', '<mask>', '<cls>', '<sep>']) # Get data for the next sentence prediction task examples = [] for paragraph in paragraphs: examples.extend(_get_nsp_data_from_paragraph( paragraph, paragraphs, self.vocab, max_len)) # Get data for the masked language model task examples = [(_get_mlm_data_from_tokens(tokens, self.vocab) + (segments, is_next)) for tokens, segments, is_next in examples] # Pad inputs (self.all_token_ids, self.all_segments, self.valid_lens, self.all_pred_positions, self.all_mlm_weights, self.all_mlm_labels, self.nsp_labels) = _pad_bert_inputs( examples, max_len, self.vocab) def __getitem__(self, idx): return (self.all_token_ids[idx], self.all_segments[idx], self.valid_lens[idx], self.all_pred_positions[idx], self.all_mlm_weights[idx], self.all_mlm_labels[idx], self.nsp_labels[idx]) def __len__(self): return len(self.all_token_ids)
[docs]def load_data_wiki(batch_size, max_len): """Load the WikiText-2 dataset. Defined in :numref:`subsec_prepare_mlm_data`""" num_workers = d2l.get_dataloader_workers() data_dir = d2l.download_extract('wikitext-2', 'wikitext-2') paragraphs = _read_wiki(data_dir) train_set = _WikiTextDataset(paragraphs, max_len) train_iter =, batch_size, shuffle=True, num_workers=num_workers) return train_iter, train_set.vocab
def _get_batch_loss_bert(net, loss, vocab_size, tokens_X_shards, segments_X_shards, valid_lens_x_shards, pred_positions_X_shards, mlm_weights_X_shards, mlm_Y_shards, nsp_y_shards): """Defined in :numref:`sec_bert-pretraining`""" mlm_ls, nsp_ls, ls = [], [], [] for (tokens_X_shard, segments_X_shard, valid_lens_x_shard, pred_positions_X_shard, mlm_weights_X_shard, mlm_Y_shard, nsp_y_shard) in zip( tokens_X_shards, segments_X_shards, valid_lens_x_shards, pred_positions_X_shards, mlm_weights_X_shards, mlm_Y_shards, nsp_y_shards): # Forward pass _, mlm_Y_hat, nsp_Y_hat = net( tokens_X_shard, segments_X_shard, valid_lens_x_shard.reshape(-1), pred_positions_X_shard) # Compute masked language model loss mlm_l = loss( mlm_Y_hat.reshape((-1, vocab_size)), mlm_Y_shard.reshape(-1), mlm_weights_X_shard.reshape((-1, 1))) mlm_l = mlm_l.sum() / (mlm_weights_X_shard.sum() + 1e-8) # Compute next sentence prediction loss nsp_l = loss(nsp_Y_hat, nsp_y_shard) nsp_l = nsp_l.mean() mlm_ls.append(mlm_l) nsp_ls.append(nsp_l) ls.append(mlm_l + nsp_l) npx.waitall() return mlm_ls, nsp_ls, ls d2l.DATA_HUB['aclImdb'] = ( '', '01ada507287d82875905620988597833ad4e0903')
[docs]def read_imdb(data_dir, is_train): """Read the IMDb review dataset text sequences and labels. Defined in :numref:`sec_sentiment`""" data, labels = [], [] for label in ('pos', 'neg'): folder_name = os.path.join(data_dir, 'train' if is_train else 'test', label) for file in os.listdir(folder_name): with open(os.path.join(folder_name, file), 'rb') as f: review ='utf-8').replace('\n', '') data.append(review) labels.append(1 if label == 'pos' else 0) return data, labels
[docs]def load_data_imdb(batch_size, num_steps=500): """Return data iterators and the vocabulary of the IMDb review dataset. Defined in :numref:`sec_sentiment`""" data_dir = d2l.download_extract('aclImdb', 'aclImdb') train_data = read_imdb(data_dir, True) test_data = read_imdb(data_dir, False) train_tokens = d2l.tokenize(train_data[0], token='word') test_tokens = d2l.tokenize(test_data[0], token='word') vocab = d2l.Vocab(train_tokens, min_freq=5) train_features = np.array([d2l.truncate_pad( vocab[line], num_steps, vocab['<pad>']) for line in train_tokens]) test_features = np.array([d2l.truncate_pad( vocab[line], num_steps, vocab['<pad>']) for line in test_tokens]) train_iter = d2l.load_array((train_features, train_data[1]), batch_size) test_iter = d2l.load_array((test_features, test_data[1]), batch_size, is_train=False) return train_iter, test_iter, vocab
[docs]def predict_sentiment(net, vocab, sequence): """Predict the sentiment of a text sequence. Defined in :numref:`sec_sentiment_rnn`""" sequence = np.array(vocab[sequence.split()], ctx=d2l.try_gpu()) label = np.argmax(net(sequence.reshape(1, -1)), axis=1) return 'positive' if label == 1 else 'negative'
d2l.DATA_HUB['SNLI'] = ( '', '9fcde07509c7e87ec61c640c1b2753d9041758e4')
[docs]def read_snli(data_dir, is_train): """Read the SNLI dataset into premises, hypotheses, and labels. Defined in :numref:`sec_natural-language-inference-and-dataset`""" def extract_text(s): # Remove information that will not be used by us s = re.sub('\\(', '', s) s = re.sub('\\)', '', s) # Substitute two or more consecutive whitespace with space s = re.sub('\\s{2,}', ' ', s) return s.strip() label_set = {'entailment': 0, 'contradiction': 1, 'neutral': 2} file_name = os.path.join(data_dir, 'snli_1.0_train.txt' if is_train else 'snli_1.0_test.txt') with open(file_name, 'r') as f: rows = [row.split('\t') for row in f.readlines()[1:]] premises = [extract_text(row[1]) for row in rows if row[0] in label_set] hypotheses = [extract_text(row[2]) for row in rows if row[0] in label_set] labels = [label_set[row[0]] for row in rows if row[0] in label_set] return premises, hypotheses, labels
[docs]class SNLIDataset( """A customized dataset to load the SNLI dataset. Defined in :numref:`sec_natural-language-inference-and-dataset`""" def __init__(self, dataset, num_steps, vocab=None): self.num_steps = num_steps all_premise_tokens = d2l.tokenize(dataset[0]) all_hypothesis_tokens = d2l.tokenize(dataset[1]) if vocab is None: self.vocab = d2l.Vocab(all_premise_tokens + all_hypothesis_tokens, min_freq=5, reserved_tokens=['<pad>']) else: self.vocab = vocab self.premises = self._pad(all_premise_tokens) self.hypotheses = self._pad(all_hypothesis_tokens) self.labels = np.array(dataset[2]) print('read ' + str(len(self.premises)) + ' examples') def _pad(self, lines): return np.array([d2l.truncate_pad( self.vocab[line], self.num_steps, self.vocab['<pad>']) for line in lines]) def __getitem__(self, idx): return (self.premises[idx], self.hypotheses[idx]), self.labels[idx] def __len__(self): return len(self.premises)
[docs]def load_data_snli(batch_size, num_steps=50): """Download the SNLI dataset and return data iterators and vocabulary. Defined in :numref:`sec_natural-language-inference-and-dataset`""" num_workers = d2l.get_dataloader_workers() data_dir = d2l.download_extract('SNLI') train_data = read_snli(data_dir, True) test_data = read_snli(data_dir, False) train_set = SNLIDataset(train_data, num_steps) test_set = SNLIDataset(test_data, num_steps, train_set.vocab) train_iter =, batch_size, shuffle=True, num_workers=num_workers) test_iter =, batch_size, shuffle=False, num_workers=num_workers) return train_iter, test_iter, train_set.vocab
[docs]def split_batch_multi_inputs(X, y, devices): """Split multi-input `X` and `y` into multiple devices. Defined in :numref:`sec_natural-language-inference-attention`""" X = list(zip(*[gluon.utils.split_and_load( feature, devices, even_split=False) for feature in X])) return (X, gluon.utils.split_and_load(y, devices, even_split=False))
[docs]def predict_snli(net, vocab, premise, hypothesis): """Predict the logical relationship between the premise and hypothesis. Defined in :numref:`sec_natural-language-inference-attention`""" premise = np.array(vocab[premise], ctx=d2l.try_gpu()) hypothesis = np.array(vocab[hypothesis], ctx=d2l.try_gpu()) label = np.argmax(net([premise.reshape((1, -1)), hypothesis.reshape((1, -1))]), axis=1) return 'entailment' if label == 0 else 'contradiction' if label == 1 \ else 'neutral'
d2l.DATA_HUB['ml-100k'] = ( '', 'cd4dcac4241c8a4ad7badc7ca635da8a69dddb83')
[docs]def read_data_ml100k(): data_dir = d2l.download_extract('ml-100k') names = ['user_id', 'item_id', 'rating', 'timestamp'] data = pd.read_csv(os.path.join(data_dir, ''), '\t', names=names, engine='python') num_users = data.user_id.unique().shape[0] num_items = data.item_id.unique().shape[0] return data, num_users, num_items
[docs]def split_data_ml100k(data, num_users, num_items, split_mode='random', test_ratio=0.1): """Split the dataset in random mode or seq-aware mode.""" if split_mode == 'seq-aware': train_items, test_items, train_list = {}, {}, [] for line in data.itertuples(): u, i, rating, time = line[1], line[2], line[3], line[4] train_items.setdefault(u, []).append((u, i, rating, time)) if u not in test_items or test_items[u][-1] < time: test_items[u] = (i, rating, time) for u in range(1, num_users + 1): train_list.extend(sorted(train_items[u], key=lambda k: k[3])) test_data = [(key, *value) for key, value in test_items.items()] train_data = [item for item in train_list if item not in test_data] train_data = pd.DataFrame(train_data) test_data = pd.DataFrame(test_data) else: mask = [True if x == 1 else False for x in np.random.uniform( 0, 1, (len(data))) < 1 - test_ratio] neg_mask = [not x for x in mask] train_data, test_data = data[mask], data[neg_mask] return train_data, test_data
[docs]def load_data_ml100k(data, num_users, num_items, feedback='explicit'): users, items, scores = [], [], [] inter = np.zeros((num_items, num_users)) if feedback == 'explicit' else {} for line in data.itertuples(): user_index, item_index = int(line[1] - 1), int(line[2] - 1) score = int(line[3]) if feedback == 'explicit' else 1 users.append(user_index) items.append(item_index) scores.append(score) if feedback == 'implicit': inter.setdefault(user_index, []).append(item_index) else: inter[item_index, user_index] = score return users, items, scores, inter
[docs]def split_and_load_ml100k(split_mode='seq-aware', feedback='explicit', test_ratio=0.1, batch_size=256): data, num_users, num_items = read_data_ml100k() train_data, test_data = split_data_ml100k( data, num_users, num_items, split_mode, test_ratio) train_u, train_i, train_r, _ = load_data_ml100k( train_data, num_users, num_items, feedback) test_u, test_i, test_r, _ = load_data_ml100k( test_data, num_users, num_items, feedback) train_set = np.array(train_u), np.array(train_i), np.array(train_r)) test_set = np.array(test_u), np.array(test_i), np.array(test_r)) train_iter = train_set, shuffle=True, last_batch='rollover', batch_size=batch_size) test_iter = test_set, batch_size=batch_size) return num_users, num_items, train_iter, test_iter
[docs]def train_recsys_rating(net, train_iter, test_iter, loss, trainer, num_epochs, devices=d2l.try_all_gpus(), evaluator=None, **kwargs): timer = d2l.Timer() animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0, 2], legend=['train loss', 'test RMSE']) for epoch in range(num_epochs): metric, l = d2l.Accumulator(3), 0. for i, values in enumerate(train_iter): timer.start() input_data = [] values = values if isinstance(values, list) else [values] for v in values: input_data.append(gluon.utils.split_and_load(v, devices)) train_feat = input_data[0:-1] if len(values) > 1 else input_data train_label = input_data[-1] with autograd.record(): preds = [net(*t) for t in zip(*train_feat)] ls = [loss(p, s) for p, s in zip(preds, train_label)] [l.backward() for l in ls] l += sum([l.asnumpy() for l in ls]).mean() / len(devices) trainer.step(values[0].shape[0]) metric.add(l, values[0].shape[0], values[0].size) timer.stop() if len(kwargs) > 0: # It will be used in section AutoRec test_rmse = evaluator(net, test_iter, kwargs['inter_mat'], devices) else: test_rmse = evaluator(net, test_iter, devices) train_l = l / (i + 1) animator.add(epoch + 1, (train_l, test_rmse)) print(f'train loss {metric[0] / metric[1]:.3f}, ' f'test RMSE {test_rmse:.3f}') print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec ' f'on {str(devices)}')
[docs]class BPRLoss(gluon.loss.Loss): def __init__(self, weight=None, batch_axis=0, **kwargs): super(BPRLoss, self).__init__(weight=None, batch_axis=0, **kwargs)
[docs] def forward(self, positive, negative): distances = positive - negative loss = - np.sum(np.log(npx.sigmoid(distances)), 0, keepdims=True) return loss
[docs]class HingeLossbRec(gluon.loss.Loss): def __init__(self, weight=None, batch_axis=0, **kwargs): super(HingeLossbRec, self).__init__(weight=None, batch_axis=0, **kwargs)
[docs] def forward(self, positive, negative, margin=1): distances = positive - negative loss = np.sum(np.maximum(- distances + margin, 0)) return loss
[docs]def hit_and_auc(rankedlist, test_matrix, k): hits_k = [(idx, val) for idx, val in enumerate(rankedlist[:k]) if val in set(test_matrix)] hits_all = [(idx, val) for idx, val in enumerate(rankedlist) if val in set(test_matrix)] max = len(rankedlist) - 1 auc = 1.0 * (max - hits_all[0][0]) / max if len(hits_all) > 0 else 0 return len(hits_k), auc
[docs]def evaluate_ranking(net, test_input, seq, candidates, num_users, num_items, devices): ranked_list, ranked_items, hit_rate, auc = {}, {}, [], [] all_items = set([i for i in range(num_users)]) for u in range(num_users): neg_items = list(all_items - set(candidates[int(u)])) user_ids, item_ids, x, scores = [], [], [], [] [item_ids.append(i) for i in neg_items] [user_ids.append(u) for _ in neg_items] x.extend([np.array(user_ids)]) if seq is not None: x.append(seq[user_ids, :]) x.extend([np.array(item_ids)]) test_data_iter =*x), shuffle=False, last_batch="keep", batch_size=1024) for index, values in enumerate(test_data_iter): x = [gluon.utils.split_and_load(v, devices, even_split=False) for v in values] scores.extend([list(net(*t).asnumpy()) for t in zip(*x)]) scores = [item for sublist in scores for item in sublist] item_scores = list(zip(item_ids, scores)) ranked_list[u] = sorted(item_scores, key=lambda t: t[1], reverse=True) ranked_items[u] = [r[0] for r in ranked_list[u]] temp = hit_and_auc(ranked_items[u], test_input[u], 50) hit_rate.append(temp[0]) auc.append(temp[1]) return np.mean(np.array(hit_rate)), np.mean(np.array(auc))
[docs]def train_ranking(net, train_iter, test_iter, loss, trainer, test_seq_iter, num_users, num_items, num_epochs, devices, evaluator, candidates, eval_step=1): timer, hit_rate, auc = d2l.Timer(), 0, 0 animator = d2l.Animator(xlabel='epoch', xlim=[1, num_epochs], ylim=[0, 1], legend=['test hit rate', 'test AUC']) for epoch in range(num_epochs): metric, l = d2l.Accumulator(3), 0. for i, values in enumerate(train_iter): input_data = [] for v in values: input_data.append(gluon.utils.split_and_load(v, devices)) with autograd.record(): p_pos = [net(*t) for t in zip(*input_data[0:-1])] p_neg = [net(*t) for t in zip(*input_data[0:-2], input_data[-1])] ls = [loss(p, n) for p, n in zip(p_pos, p_neg)] [l.backward(retain_graph=False) for l in ls] l += sum([l.asnumpy() for l in ls]).mean()/len(devices) trainer.step(values[0].shape[0]) metric.add(l, values[0].shape[0], values[0].size) timer.stop() with autograd.predict_mode(): if (epoch + 1) % eval_step == 0: hit_rate, auc = evaluator(net, test_iter, test_seq_iter, candidates, num_users, num_items, devices) animator.add(epoch + 1, (hit_rate, auc)) print(f'train loss {metric[0] / metric[1]:.3f}, ' f'test hit rate {float(hit_rate):.3f}, test AUC {float(auc):.3f}') print(f'{metric[2] * num_epochs / timer.sum():.1f} examples/sec ' f'on {str(devices)}')
d2l.DATA_HUB['ctr'] = (d2l.DATA_URL + '', 'e18327c48c8e8e5c23da714dd614e390d369843f')
[docs]class CTRDataset( def __init__(self, data_path, feat_mapper=None, defaults=None, min_threshold=4, num_feat=34): self.NUM_FEATS, self.count, = num_feat, 0, {} feat_cnts = defaultdict(lambda: defaultdict(int)) self.feat_mapper, self.defaults = feat_mapper, defaults self.field_dims = np.zeros(self.NUM_FEATS, dtype=np.int64) with open(data_path) as f: for line in f: instance = {} values = line.rstrip('\n').split('\t') if len(values) != self.NUM_FEATS + 1: continue label = np.float32([0, 0]) label[int(values[0])] = 1 instance['y'] = [np.float32(values[0])] for i in range(1, self.NUM_FEATS + 1): feat_cnts[i][values[i]] += 1 instance.setdefault('x', []).append(values[i])[self.count] = instance self.count = self.count + 1 if self.feat_mapper is None and self.defaults is None: feat_mapper = {i: {feat for feat, c in cnt.items() if c >= min_threshold} for i, cnt in feat_cnts.items()} self.feat_mapper = {i: {feat_v: idx for idx, feat_v in enumerate(feat_values)} for i, feat_values in feat_mapper.items()} self.defaults = {i: len(feat_values) for i, feat_values in feat_mapper.items()} for i, fm in self.feat_mapper.items(): self.field_dims[i - 1] = len(fm) + 1 self.offsets = np.array((0, *np.cumsum(self.field_dims).asnumpy() [:-1])) def __len__(self): return self.count def __getitem__(self, idx): feat = np.array([self.feat_mapper[i + 1].get(v, self.defaults[i + 1]) for i, v in enumerate([idx]['x'])]) return feat + self.offsets,[idx]['y']
[docs]def update_D(X, Z, net_D, net_G, loss, trainer_D): """Update discriminator. Defined in :numref:`sec_basic_gan`""" batch_size = X.shape[0] ones = np.ones((batch_size,), ctx=X.ctx) zeros = np.zeros((batch_size,), ctx=X.ctx) with autograd.record(): real_Y = net_D(X) fake_X = net_G(Z) # Do not need to compute gradient for `net_G`, detach it from # computing gradients. fake_Y = net_D(fake_X.detach()) loss_D = (loss(real_Y, ones) + loss(fake_Y, zeros)) / 2 loss_D.backward() trainer_D.step(batch_size) return float(loss_D.sum())
[docs]def update_G(Z, net_D, net_G, loss, trainer_G): """Update generator. Defined in :numref:`sec_basic_gan`""" batch_size = Z.shape[0] ones = np.ones((batch_size,), ctx=Z.ctx) with autograd.record(): # We could reuse `fake_X` from `update_D` to save computation fake_X = net_G(Z) # Recomputing `fake_Y` is needed since `net_D` is changed fake_Y = net_D(fake_X) loss_G = loss(fake_Y, ones) loss_G.backward() trainer_G.step(batch_size) return float(loss_G.sum())
d2l.DATA_HUB['pokemon'] = (d2l.DATA_URL + '', 'c065c0e2593b8b161a2d7873e42418bf6a21106c')# Alias defined in config.ini size = lambda a: a.size transpose = lambda a: a.T ones = np.ones zeros = np.zeros arange = np.arange meshgrid = np.meshgrid sin = np.sin sinh = np.sinh cos = np.cos cosh = np.cosh tanh = np.tanh linspace = np.linspace exp = np.exp log = np.log tensor = np.array normal = np.random.normal rand = np.random.rand matmul = int32 = np.int32 float32 = np.float32 concat = np.concatenate stack = np.stack abs = np.abs eye = np.eye numpy = lambda x, *args, **kwargs: x.asnumpy(*args, **kwargs) reshape = lambda x, *args, **kwargs: x.reshape(*args, **kwargs) to = lambda x, *args, **kwargs: x.as_in_context(*args, **kwargs) reduce_sum = lambda x, *args, **kwargs: x.sum(*args, **kwargs) argmax = lambda x, *args, **kwargs: x.argmax(*args, **kwargs) astype = lambda x, *args, **kwargs: x.astype(*args, **kwargs)