.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python from mxnet import init, np, npx from mxnet.gluon import nn npx.set_np() net = nn.Sequential() net.add(nn.Dense(8, activation='relu')) net.add(nn.Dense(1)) net.initialize() # Use the default initialization method X = np.random.uniform(size=(2, 4)) net(X) # Forward computation .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output array([[0.0054572 ], [0.00488594]]) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python import torch from torch import nn net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1)) X = torch.rand(size=(2, 4)) net(X) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output tensor([[0.0731], [0.1613]], grad_fn=) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python import tensorflow as tf net = tf.keras.models.Sequential([ tf.keras.layers.Flatten(), tf.keras.layers.Dense(4, activation=tf.nn.relu), tf.keras.layers.Dense(1), ]) X = tf.random.uniform((2, 4)) net(X) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output .. raw:: html

.. raw:: html

mxnet pytorch tensorflow

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(net[1].params) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output dense1_ ( Parameter dense1_weight (shape=(1, 8), dtype=float32) Parameter dense1_bias (shape=(1,), dtype=float32) ) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(net[2].state_dict()) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output OrderedDict([('weight', tensor([[-0.2665, 0.1665, -0.2725, -0.1111, 0.1236, -0.2030, 0.3430, 0.3423]])), ('bias', tensor([0.0211]))]) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(net.layers[2].weights) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output [, ] .. raw:: html

.. raw:: html

mxnet pytorch tensorflow

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(type(net[1].bias)) print(net[1].bias) print(net[1].bias.data()) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output Parameter dense1_bias (shape=(1,), dtype=float32) [0.] Parameters are complex objects, containing values, gradients, and additional information. That's why we need to request the value explicitly. In addition to the value, each parameter also allows us to access the gradient. Because we have not invoked backpropagation for this network yet, it is in its initial state. .. raw:: latex \diilbookstyleinputcell .. code:: python net[1].weight.grad() .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output array([[0., 0., 0., 0., 0., 0., 0., 0.]]) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(type(net[2].bias)) print(net[2].bias) print(net[2].bias.data) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output Parameter containing: tensor([0.0211], requires_grad=True) tensor([0.0211]) Parameters are complex objects, containing values, gradients, and additional information. That's why we need to request the value explicitly. In addition to the value, each parameter also allows us to access the gradient. Because we have not invoked backpropagation for this network yet, it is in its initial state. .. raw:: latex \diilbookstyleinputcell .. code:: python net[2].weight.grad == None .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output True .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(type(net.layers[2].weights[1])) print(net.layers[2].weights[1]) print(tf.convert_to_tensor(net.layers[2].weights[1])) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output tf.Tensor([0.], shape=(1,), dtype=float32) .. raw:: html

.. raw:: html

mxnet pytorch tensorflow

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(net[0].collect_params()) print(net.collect_params()) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output dense0_ ( Parameter dense0_weight (shape=(8, 4), dtype=float32) Parameter dense0_bias (shape=(8,), dtype=float32) ) sequential0_ ( Parameter dense0_weight (shape=(8, 4), dtype=float32) Parameter dense0_bias (shape=(8,), dtype=float32) Parameter dense1_weight (shape=(1, 8), dtype=float32) Parameter dense1_bias (shape=(1,), dtype=float32) ) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(*[(name, param.shape) for name, param in net[0].named_parameters()]) print(*[(name, param.shape) for name, param in net.named_parameters()]) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output ('weight', torch.Size([8, 4])) ('bias', torch.Size([8])) ('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1])) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(net.layers[1].weights) print(net.get_weights()) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output [, ] [array([[ 0.6733095 , -0.1968227 , 0.70611614, 0.42580146], [ 0.8420772 , -0.05097824, -0.29724634, 0.7435817 ], [-0.55750954, -0.36873615, 0.8461582 , -0.75422806], [-0.09947252, -0.71967113, -0.06134313, -0.12178153]], dtype=float32), array([0., 0., 0., 0.], dtype=float32), array([[ 0.6852075 ], [-0.81062627], [ 0.14000857], [ 0.5764358 ]], dtype=float32), array([0.], dtype=float32)] .. raw:: html

.. raw:: html

mxnet pytorch tensorflow

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python net.collect_params()['dense1_bias'].data() .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output array([0.]) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python net.state_dict()['2.bias'].data .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output tensor([0.0211]) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python net.get_weights()[1] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output array([0., 0., 0., 0.], dtype=float32) .. raw:: html

.. raw:: html

mxnet pytorch tensorflow

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python def block1(): net = nn.Sequential() net.add(nn.Dense(32, activation='relu')) net.add(nn.Dense(16, activation='relu')) return net def block2(): net = nn.Sequential() for _ in range(4): # Nested here net.add(block1()) return net rgnet = nn.Sequential() rgnet.add(block2()) rgnet.add(nn.Dense(10)) rgnet.initialize() rgnet(X) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output array([[-6.3465846e-09, -1.1096752e-09, 6.4161787e-09, 6.6354140e-09, -1.1265507e-09, 1.3284951e-10, 9.3619388e-09, 3.2229084e-09, 5.9429879e-09, 8.8181435e-09], [-8.6219423e-09, -7.5150686e-10, 8.3133251e-09, 8.9321128e-09, -1.6740003e-09, 3.2405989e-10, 1.2115976e-08, 4.4926449e-09, 8.0741742e-09, 1.2075874e-08]]) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python def block1(): return nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 4), nn.ReLU()) def block2(): net = nn.Sequential() for i in range(4): # Nested here net.add_module(f'block {i}', block1()) return net rgnet = nn.Sequential(block2(), nn.Linear(4, 1)) rgnet(X) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output tensor([[-0.3015], [-0.3015]], grad_fn=) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python def block1(name): return tf.keras.Sequential([ tf.keras.layers.Flatten(), tf.keras.layers.Dense(4, activation=tf.nn.relu)], name=name) def block2(): net = tf.keras.Sequential() for i in range(4): # Nested here net.add(block1(name=f'block-{i}')) return net rgnet = tf.keras.Sequential() rgnet.add(block2()) rgnet.add(tf.keras.layers.Dense(1)) rgnet(X) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output .. raw:: html

.. raw:: html

mxnet pytorch tensorflow

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(rgnet.collect_params) print(rgnet.collect_params()) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output 32, Activation(relu)) (1): Dense(32 -> 16, Activation(relu)) ) (1): Sequential( (0): Dense(16 -> 32, Activation(relu)) (1): Dense(32 -> 16, Activation(relu)) ) (2): Sequential( (0): Dense(16 -> 32, Activation(relu)) (1): Dense(32 -> 16, Activation(relu)) ) (3): Sequential( (0): Dense(16 -> 32, Activation(relu)) (1): Dense(32 -> 16, Activation(relu)) ) ) (1): Dense(16 -> 10, linear) )> sequential1_ ( Parameter dense2_weight (shape=(32, 4), dtype=float32) Parameter dense2_bias (shape=(32,), dtype=float32) Parameter dense3_weight (shape=(16, 32), dtype=float32) Parameter dense3_bias (shape=(16,), dtype=float32) Parameter dense4_weight (shape=(32, 16), dtype=float32) Parameter dense4_bias (shape=(32,), dtype=float32) Parameter dense5_weight (shape=(16, 32), dtype=float32) Parameter dense5_bias (shape=(16,), dtype=float32) Parameter dense6_weight (shape=(32, 16), dtype=float32) Parameter dense6_bias (shape=(32,), dtype=float32) Parameter dense7_weight (shape=(16, 32), dtype=float32) Parameter dense7_bias (shape=(16,), dtype=float32) Parameter dense8_weight (shape=(32, 16), dtype=float32) Parameter dense8_bias (shape=(32,), dtype=float32) Parameter dense9_weight (shape=(16, 32), dtype=float32) Parameter dense9_bias (shape=(16,), dtype=float32) Parameter dense10_weight (shape=(10, 16), dtype=float32) Parameter dense10_bias (shape=(10,), dtype=float32) ) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(rgnet) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output Sequential( (0): Sequential( (block 0): Sequential( (0): Linear(in_features=4, out_features=8, bias=True) (1): ReLU() (2): Linear(in_features=8, out_features=4, bias=True) (3): ReLU() ) (block 1): Sequential( (0): Linear(in_features=4, out_features=8, bias=True) (1): ReLU() (2): Linear(in_features=8, out_features=4, bias=True) (3): ReLU() ) (block 2): Sequential( (0): Linear(in_features=4, out_features=8, bias=True) (1): ReLU() (2): Linear(in_features=8, out_features=4, bias=True) (3): ReLU() ) (block 3): Sequential( (0): Linear(in_features=4, out_features=8, bias=True) (1): ReLU() (2): Linear(in_features=8, out_features=4, bias=True) (3): ReLU() ) ) (1): Linear(in_features=4, out_features=1, bias=True) ) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python print(rgnet.summary()) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output Model: "sequential_1" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= sequential_2 (Sequential) (2, 4) 80 dense_6 (Dense) (2, 1) 5 ================================================================= Total params: 85 Trainable params: 85 Non-trainable params: 0 _________________________________________________________________ None .. raw:: html

.. raw:: html

mxnet pytorch tensorflow

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python rgnet[0][1][0].bias.data() .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python rgnet[0][1][0].bias.data .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output tensor([-0.1396, 0.2621, -0.4331, -0.2222, -0.3755, -0.0421, -0.4612, -0.4498]) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python rgnet.layers[0].layers[1].layers[1].weights[1] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output .. raw:: html

.. raw:: html

mxnet pytorch tensorflow

.. raw:: html

By default, MXNet initializes weight parameters by randomly drawing from a uniform distribution :math:`U(-0.07, 0.07)`, clearing bias parameters to zero. MXNet's ``init`` module provides a variety of preset initialization methods. .. raw:: html

.. raw:: html

By default, PyTorch initializes weight and bias matrices uniformly by drawing from a range that is computed according to the input and output dimension. PyTorch's ``nn.init`` module provides a variety of preset initialization methods. .. raw:: html

.. raw:: html

By default, Keras initializes weight matrices uniformly by drawing from a range that is computed according to the input and output dimension, and the bias parameters are all set to zero. TensorFlow provides a variety of initialization methods both in the root module and the ``keras.initializers`` module. .. raw:: html

.. raw:: html

mxnet pytorch tensorflow

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python # Here `force_reinit` ensures that parameters are freshly initialized even if # they were already initialized previously net.initialize(init=init.Normal(sigma=0.01), force_reinit=True) net[0].weight.data()[0] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output array([-0.00324057, -0.00895028, -0.00698632, 0.01030831]) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python def init_normal(m): if type(m) == nn.Linear: nn.init.normal_(m.weight, mean=0, std=0.01) nn.init.zeros_(m.bias) net.apply(init_normal) net[0].weight.data[0], net[0].bias.data[0] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output (tensor([ 0.0007, 0.0208, 0.0103, -0.0094]), tensor(0.)) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python net = tf.keras.models.Sequential([ tf.keras.layers.Flatten(), tf.keras.layers.Dense( 4, activation=tf.nn.relu, kernel_initializer=tf.random_normal_initializer(mean=0, stddev=0.01), bias_initializer=tf.zeros_initializer()), tf.keras.layers.Dense(1)]) net(X) net.weights[0], net.weights[1] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output (, ) .. raw:: html

.. raw:: html

mxnet pytorch tensorflow

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python net.initialize(init=init.Constant(1), force_reinit=True) net[0].weight.data()[0] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output array([1., 1., 1., 1.]) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python def init_constant(m): if type(m) == nn.Linear: nn.init.constant_(m.weight, 1) nn.init.zeros_(m.bias) net.apply(init_constant) net[0].weight.data[0], net[0].bias.data[0] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output (tensor([1., 1., 1., 1.]), tensor(0.)) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python net = tf.keras.models.Sequential([ tf.keras.layers.Flatten(), tf.keras.layers.Dense( 4, activation=tf.nn.relu, kernel_initializer=tf.keras.initializers.Constant(1), bias_initializer=tf.zeros_initializer()), tf.keras.layers.Dense(1), ]) net(X) net.weights[0], net.weights[1] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output (, ) .. raw:: html

.. raw:: html

mxnet pytorch tensorflow

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python net[0].weight.initialize(init=init.Xavier(), force_reinit=True) net[1].initialize(init=init.Constant(42), force_reinit=True) print(net[0].weight.data()[0]) print(net[1].weight.data()) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output [-0.17594433 0.02314097 -0.1992535 0.09509248] [[42. 42. 42. 42. 42. 42. 42. 42.]] .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python def xavier(m): if type(m) == nn.Linear: nn.init.xavier_uniform_(m.weight) def init_42(m): if type(m) == nn.Linear: nn.init.constant_(m.weight, 42) net[0].apply(xavier) net[2].apply(init_42) print(net[0].weight.data[0]) print(net[2].weight.data) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output tensor([-0.2886, -0.6428, -0.2568, 0.0093]) tensor([[42., 42., 42., 42., 42., 42., 42., 42.]]) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python net = tf.keras.models.Sequential([ tf.keras.layers.Flatten(), tf.keras.layers.Dense( 4, activation=tf.nn.relu, kernel_initializer=tf.keras.initializers.GlorotUniform()), tf.keras.layers.Dense( 1, kernel_initializer=tf.keras.initializers.Constant(42)), ]) net(X) print(net.layers[1].weights[0]) print(net.layers[2].weights[0]) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output .. raw:: html

.. raw:: html

mxnet pytorch tensorflow

.. raw:: html

Here we define a subclass of the ``Initializer`` class. Usually, we only need to implement the ``_init_weight`` function which takes a tensor argument (``data``) and assigns to it the desired initialized values. .. raw:: latex \diilbookstyleinputcell .. code:: python class MyInit(init.Initializer): def _init_weight(self, name, data): print('Init', name, data.shape) data[:] = np.random.uniform(-10, 10, data.shape) data *= np.abs(data) >= 5 net.initialize(MyInit(), force_reinit=True) net[0].weight.data()[:2] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output Init dense0_weight (8, 4) Init dense1_weight (1, 8) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output array([[ 0. , -0. , -0. , 8.522827 ], [ 0. , -8.828651 , -0. , -5.6012006]]) .. raw:: html

.. raw:: html

Again, we implement a ``my_init`` function to apply to ``net``. .. raw:: latex \diilbookstyleinputcell .. code:: python def my_init(m): if type(m) == nn.Linear: print("Init", *[(name, param.shape) for name, param in m.named_parameters()][0]) nn.init.uniform_(m.weight, -10, 10) m.weight.data *= m.weight.data.abs() >= 5 net.apply(my_init) net[0].weight[:2] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output Init weight torch.Size([8, 4]) Init weight torch.Size([1, 8]) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output tensor([[ 0.0000, -7.4797, -8.2019, 9.8475], [ 5.0957, 7.4569, -0.0000, -0.0000]], grad_fn=) .. raw:: html

.. raw:: html

Here we define a subclass of ``Initializer`` and implement the ``__call__`` function that return a desired tensor given the shape and data type. .. raw:: latex \diilbookstyleinputcell .. code:: python class MyInit(tf.keras.initializers.Initializer): def __call__(self, shape, dtype=None): data=tf.random.uniform(shape, -10, 10, dtype=dtype) factor=(tf.abs(data) >= 5) factor=tf.cast(factor, tf.float32) return data * factor net = tf.keras.models.Sequential([ tf.keras.layers.Flatten(), tf.keras.layers.Dense( 4, activation=tf.nn.relu, kernel_initializer=MyInit()), tf.keras.layers.Dense(1), ]) net(X) print(net.layers[1].weights[0]) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output .. raw:: html

.. raw:: html

mxnet pytorch tensorflow

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python net[0].weight.data()[:] += 1 net[0].weight.data()[0, 0] = 42 net[0].weight.data()[0] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output array([42. , 1. , 1. , 9.522827]) A note for advanced users: if you want to adjust parameters within an ``autograd`` scope, you need to use ``set_data`` to avoid confusing the automatic differentiation mechanics. .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python net[0].weight.data[:] += 1 net[0].weight.data[0, 0] = 42 net[0].weight.data[0] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output tensor([42.0000, -6.4797, -7.2019, 10.8475]) .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python net.layers[1].weights[0][:].assign(net.layers[1].weights[0] + 1) net.layers[1].weights[0][0, 0].assign(42) net.layers[1].weights[0] .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output .. raw:: html

.. raw:: html

mxnet pytorch tensorflow

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python net = nn.Sequential() # We need to give the shared layer a name so that we can refer to its # parameters shared = nn.Dense(8, activation='relu') net.add(nn.Dense(8, activation='relu'), shared, nn.Dense(8, activation='relu', params=shared.params), nn.Dense(10)) net.initialize() X = np.random.uniform(size=(2, 20)) net(X) # Check whether the parameters are the same print(net[1].weight.data()[0] == net[2].weight.data()[0]) net[1].weight.data()[0, 0] = 100 # Make sure that they are actually the same object rather than just having the # same value print(net[1].weight.data()[0] == net[2].weight.data()[0]) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output [ True True True True True True True True] [ True True True True True True True True] This example shows that the parameters of the second and third layer are tied. They are not just equal, they are represented by the same exact tensor. Thus, if we change one of the parameters, the other one changes, too. You might wonder, when parameters are tied what happens to the gradients? Since the model parameters contain gradients, the gradients of the second hidden layer and the third hidden layer are added together during backpropagation. .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python # We need to give the shared layer a name so that we can refer to its # parameters shared = nn.Linear(8, 8) net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), shared, nn.ReLU(), shared, nn.ReLU(), nn.Linear(8, 1)) net(X) # Check whether the parameters are the same print(net[2].weight.data[0] == net[4].weight.data[0]) net[2].weight.data[0, 0] = 100 # Make sure that they are actually the same object rather than just having the # same value print(net[2].weight.data[0] == net[4].weight.data[0]) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output tensor([True, True, True, True, True, True, True, True]) tensor([True, True, True, True, True, True, True, True]) This example shows that the parameters of the second and third layer are tied. They are not just equal, they are represented by the same exact tensor. Thus, if we change one of the parameters, the other one changes, too. You might wonder, when parameters are tied what happens to the gradients? Since the model parameters contain gradients, the gradients of the second hidden layer and the third hidden layer are added together during backpropagation. .. raw:: html

.. raw:: html

.. raw:: latex \diilbookstyleinputcell .. code:: python # tf.keras behaves a bit differently. It removes the duplicate layer # automatically shared = tf.keras.layers.Dense(4, activation=tf.nn.relu) net = tf.keras.models.Sequential([ tf.keras.layers.Flatten(), shared, shared, tf.keras.layers.Dense(1), ]) net(X) # Check whether the parameters are different print(len(net.layers) == 3) .. raw:: latex \diilbookstyleoutputcell .. parsed-literal:: :class: output True .. raw:: html

.. raw:: html