!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2021 NVIDIA Corporation
Built on Sun_Feb_14_21:12:58_PST_2021
Cuda compilation tools, release 11.2, V11.2.152
Build cuda_11.2.r11.2/compiler.29618528_0


!nvidia-smi

Tue Oct 11 15:39:02 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.63.01    Driver Version: 470.63.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|===============================+======================+======================|
|   0  NVIDIA GeForce ...  Off  | 00000000:19:00.0 Off |                  N/A |
|  0%   32C    P8    19W / 260W |      0MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  NVIDIA GeForce ...  Off  | 00000000:1A:00.0 Off |                  N/A |
| 27%   29C    P8     4W / 250W |      0MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   2  NVIDIA GeForce ...  Off  | 00000000:67:00.0 Off |                  N/A |
| 27%   31C    P8     4W / 250W |      0MiB / 11019MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   3  NVIDIA GeForce ...  Off  | 00000000:68:00.0 Off |                  N/A |
| 27%   35C    P8    26W / 250W |      0MiB / 11016MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|        ID   ID                                                   Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+


import tensorflow as tf
print("TensorFlow Version:", tf.__version__)
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

TensorFlow Version: 2.9.1
Num GPUs Available:  4


import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt


# Pin this process to one GPU and let TensorFlow allocate GPU memory on demand.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Restrict TensorFlow to only use the first GPU
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        
        # Currently, memory growth needs to be the same across GPUs, so it is
        # enabled on every physical GPU and not only on the visible one.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized.
        # The error is printed rather than re-raised so the notebook keeps running.
        print(e)

4 Physical GPUs, 1 Logical GPUs


# Helper function
def show_images(image, num_row=2, num_col=5):
    """Plot the leading images of a collection on a grid of subplots.

    Grid cell ``i`` (row-major order) shows ``image[i]`` in grayscale with the
    color range fixed to [0, 1]. If fewer than ``num_row * num_col`` images
    are supplied, the remaining cells are left blank.

    Args:
        image: Indexable, sized collection of image arrays accepted by
            ``Axes.imshow`` (e.g. an array of shape (N, H, W)).
        num_row: Number of grid rows.
        num_col: Number of grid columns.
    """
    # squeeze=False keeps `axes` two-dimensional even for a single row or a
    # single column; with the default, a 1 x N grid would come back as a 1-D
    # array (or a bare Axes for 1 x 1) and 2-D indexing would fail.
    _, axes = plt.subplots(num_row, num_col,
                           figsize=(1.5 * num_col, 2 * num_row),
                           squeeze=False)
    num_images = len(image)
    # `axes.flat` walks the grid row by row, i.e. cell i is
    # axes[i // num_col, i % num_col].
    for i, ax in enumerate(axes.flat):
        ax.axis('off')
        # Do not index past the end of the data; surplus cells stay empty.
        if i < num_images:
            ax.imshow(image[i], cmap='gray', vmin=0, vmax=1)
    plt.tight_layout()
    plt.show()


# Load the MNIST train/test split bundled with Keras.
mnist = tf.keras.datasets.mnist
train_split, test_split = mnist.load_data()
x_train, y_train = train_split
x_test, y_test = test_split

# Rescale the images by 1/255 (presumably 8-bit intensities, giving values
# in [0, 1] -- the same range show_images renders with vmin=0, vmax=1).
x_train = x_train / 255.0
x_test = x_test / 255.0


print("Training data: {}".format(x_train.shape))
print("Test data: {}".format(x_test.shape))
show_images(x_train)

Training data: (60000, 28, 28)
Test data: (10000, 28, 28)


# Sequential API: a plain stack of layers, added one at a time.
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Flatten(input_shape=(28, 28)))
model.add(tf.keras.layers.Dense(128, activation='relu'))
model.add(tf.keras.layers.Dropout(0.2))
# The output layer has no activation, so the model returns raw logits.
model.add(tf.keras.layers.Dense(10))


predictions = model(x_train[:1]).numpy()
predictions

array([[-0.11171019, -0.47011006, -0.481782  , -0.05107722,  0.11246532,
         1.1753833 , -0.37382746, -0.34236422, -0.21776126,  0.66296977]],
      dtype=float32)


tf.nn.softmax(predictions).numpy()

array([[0.07717101, 0.05392661, 0.05330085, 0.08199489, 0.09656338,
        0.27953222, 0.05937699, 0.06127489, 0.06940597, 0.16745321]],
      dtype=float32)


loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)


loss_fn(y_train[:1], predictions).numpy()

1.2746377


model.compile(optimizer='adam',
              loss=loss_fn,
              metrics=['accuracy'])


model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 flatten (Flatten)           (None, 784)               0         
                                                                 
 dense (Dense)               (None, 128)               100480    
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 10)                1290      
                                                                 
=================================================================
Total params: 101,770
Trainable params: 101,770
Non-trainable params: 0
_________________________________________________________________


model.fit(x_train, y_train, batch_size=32, epochs=5)

Epoch 1/5
1875/1875 [==============================] - 7s 3ms/step - loss: 0.2956 - accuracy: 0.9134
Epoch 2/5
1875/1875 [==============================] - 6s 3ms/step - loss: 0.1437 - accuracy: 0.9574
Epoch 3/5
1875/1875 [==============================] - 6s 3ms/step - loss: 0.1066 - accuracy: 0.9675
Epoch 4/5
1875/1875 [==============================] - 6s 3ms/step - loss: 0.0857 - accuracy: 0.9732
Epoch 5/5
1875/1875 [==============================] - 6s 3ms/step - loss: 0.0738 - accuracy: 0.9764

<keras.callbacks.History at 0x7fac3c5f4460>


model.evaluate(x_test, y_test, verbose=2)

313/313 - 1s - loss: 0.0744 - accuracy: 0.9762 - 837ms/epoch - 3ms/step

[0.07441160082817078, 0.9761999845504761]


# Destroys the current TF graph and creates a new one.
tf.keras.backend.clear_session()


inputs = tf.keras.Input(shape=(28, 28))


print("Shape:", inputs.shape)
print("dtype:", inputs.dtype)

Shape: (None, 28, 28)
dtype: <dtype: 'float32'>


# Functional API: every layer is called on the output of the previous one,
# starting from the symbolic `inputs` tensor.
x = tf.keras.layers.Flatten()(inputs)
x = tf.keras.layers.Dense(128, activation="relu")(x)
x = tf.keras.layers.Dropout(0.2)(x)
# The last layer has no activation, so `outputs` holds one raw score per class.
outputs = tf.keras.layers.Dense(10)(x)


model = tf.keras.Model(inputs=inputs, outputs=outputs, name="mnist_model")
model.summary()

Model: "mnist_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 input_1 (InputLayer)        [(None, 28, 28)]          0         
                                                                 
 flatten (Flatten)           (None, 784)               0         
                                                                 
 dense (Dense)               (None, 128)               100480    
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 10)                1290      
                                                                 
=================================================================
Total params: 101,770
Trainable params: 101,770
Non-trainable params: 0
_________________________________________________________________


# from_logits=True because the model's final Dense layer has no activation.
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy']
             )

# validation_split=0.2 holds out 20% of the training data for validation;
# the remaining samples are consumed in batches of 64.
history = model.fit(x_train, y_train, batch_size=64, epochs=5, validation_split=0.2)

# evaluate() returns [loss, accuracy], matching the loss/metrics passed to compile().
test_scores = model.evaluate(x_test, y_test, verbose=2)
print("Test loss:", test_scores[0])
print("Test accuracy:", test_scores[1])

Epoch 1/5
750/750 [==============================] - 3s 4ms/step - loss: 0.3659 - accuracy: 0.8962 - val_loss: 0.1813 - val_accuracy: 0.9498
Epoch 2/5
750/750 [==============================] - 3s 4ms/step - loss: 0.1762 - accuracy: 0.9479 - val_loss: 0.1275 - val_accuracy: 0.9638
Epoch 3/5
750/750 [==============================] - 3s 4ms/step - loss: 0.1332 - accuracy: 0.9612 - val_loss: 0.1111 - val_accuracy: 0.9680
Epoch 4/5
750/750 [==============================] - 3s 4ms/step - loss: 0.1077 - accuracy: 0.9683 - val_loss: 0.0984 - val_accuracy: 0.9715
Epoch 5/5
750/750 [==============================] - 3s 4ms/step - loss: 0.0900 - accuracy: 0.9737 - val_loss: 0.0944 - val_accuracy: 0.9723
313/313 - 1s - loss: 0.0862 - accuracy: 0.9745 - 759ms/epoch - 2ms/step
Test loss: 0.08624877035617828
Test accuracy: 0.9745000004768372


# Destroys the current TF graph and creates a new one.
tf.keras.backend.clear_session()


# Training pipeline: (image, label) pairs, shuffled through a 10000-element
# buffer and grouped into batches of 32.
train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_ds = train_ds.shuffle(10000)
train_ds = train_ds.batch(32)

# Test pipeline: same batching, original order (no shuffle).
test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test))
test_ds = test_ds.batch(32)


class MyModel(tf.keras.Model):
    """Subclassed classifier: Flatten -> Dense(128, relu) -> Dropout(0.2) -> Dense(10)."""

    def __init__(self):
        super().__init__()
        # The layers are created in this order on purpose: it determines the
        # auto-generated layer names and the order of the model's layer list.
        self.flatten = tf.keras.layers.Flatten()
        self.dropout = tf.keras.layers.Dropout(0.2)
        self.d1 = tf.keras.layers.Dense(128, activation='relu')
        self.d2 = tf.keras.layers.Dense(10)

    def call(self, x):
        """Run the forward pass and return the output of the final Dense(10) layer."""
        hidden = self.d1(self.flatten(x))
        return self.d2(self.dropout(hidden))


# Create an instance of the model
model = MyModel()
model.build(input_shape=(None, 28, 28))
model.summary()

Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
=================================================================
 flatten (Flatten)           multiple                  0         
                                                                 
 dropout (Dropout)           multiple                  0         
                                                                 
 dense (Dense)               multiple                  100480    
                                                                 
 dense_1 (Dense)             multiple                  1290      
                                                                 
=================================================================
Total params: 101,770
Trainable params: 101,770
Non-trainable params: 0
_________________________________________________________________


loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam()


train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='train_accuracy')

test_loss = tf.keras.metrics.Mean(name='test_loss')
test_accuracy = tf.keras.metrics.SparseCategoricalAccuracy(name='test_accuracy')


@tf.function
def train_step(images, labels):
    """Apply one optimizer update for a batch and record the training metrics.

    Uses the module-level `model`, `loss_object`, `optimizer`, `train_loss`
    and `train_accuracy`. Returns nothing; results are read from the metrics.
    """
    with tf.GradientTape() as tape:
        # The forward pass runs in training mode; this matters for layers
        # that behave differently at inference time (e.g. Dropout).
        logits = model(images, training=True)
        batch_loss = loss_object(labels, logits)

    # Read the variable list only after the forward pass, as the model may
    # create its weights on first call.
    trainable_vars = model.trainable_variables
    grads = tape.gradient(batch_loss, trainable_vars)
    optimizer.apply_gradients(zip(grads, trainable_vars))

    train_loss(batch_loss)
    train_accuracy(labels, logits)


@tf.function
def test_step(images, labels):
    """Evaluate one batch and record the test metrics.

    Uses the module-level `model`, `loss_object`, `test_loss` and
    `test_accuracy`. No weights are updated.
    """
    # The forward pass runs in inference mode; this matters for layers that
    # behave differently during training (e.g. Dropout).
    logits = model(images, training=False)
    batch_loss = loss_object(labels, logits)

    test_loss(batch_loss)
    test_accuracy(labels, logits)


EPOCHS = 5

for epoch in range(EPOCHS):
    # Reset the metrics at the start of every epoch so that each reported
    # value covers that epoch only. `reset_state()` replaces the deprecated
    # `reset_states()` alias.
    for metric in (train_loss, train_accuracy, test_loss, test_accuracy):
        metric.reset_state()

    for images, labels in train_ds:
        train_step(images, labels)

    # Evaluate on the whole test set after each training epoch.
    for test_images, test_labels in test_ds:
        test_step(test_images, test_labels)

    # Accuracies are multiplied by 100 so they print as percentages.
    template = 'Epoch {}, Loss: {:.4f}, Accuracy: {:.4f}, Test Loss: {:.4f}, Test Accuracy: {:.4f}'
    print(template.format(epoch + 1,
                          train_loss.result(),
                          train_accuracy.result() * 100,
                          test_loss.result(),
                          test_accuracy.result() * 100))

Epoch 1, Loss: 0.2975, Accuracy: 91.5217, Test Loss: 0.1409, Test Accuracy: 95.7200
Epoch 2, Loss: 0.1433, Accuracy: 95.7333, Test Loss: 0.1063, Test Accuracy: 96.7700
Epoch 3, Loss: 0.1091, Accuracy: 96.7350, Test Loss: 0.0889, Test Accuracy: 97.2800
Epoch 4, Loss: 0.0888, Accuracy: 97.2033, Test Loss: 0.0842, Test Accuracy: 97.5000
Epoch 5, Loss: 0.0760, Accuracy: 97.6233, Test Loss: 0.0718, Test Accuracy: 97.8000


def f_eager(x, y):
    """Benchmark body run eagerly: evaluate mean(3 * x**2 + y) repeatedly.

    The expression is evaluated 100000 times with the result discarded, then
    once more for the return value.
    """
    def _mean_expr():
        return tf.reduce_mean(tf.multiply(x ** 2, 3) + y)

    for _step in tf.range(100000):
        _ = _mean_expr()
    return _mean_expr()

@tf.function
def f_graph(x, y):
    """Same computation as `f_eager`, but wrapped in `tf.function`.

    The expression mean(3 * x**2 + y) is evaluated 100000 times with the
    result discarded, then once more for the return value.
    """
    def _mean_expr():
        return tf.reduce_mean(tf.multiply(x ** 2, 3) + y)

    for _step in tf.range(100000):
        _ = _mean_expr()
    return _mean_expr()


x = tf.constant([[2.0, 3.0]])
y = tf.constant([[3.0, -2.0]])


# `f_eager` and `f_graph` return same values, but `f_graph` is executed as a TensorFlow graph.
assert f_eager(x, y).numpy() == f_graph(x, y).numpy()


%time _ = f_eager(x, y)

CPU times: user 23.8 s, sys: 0 ns, total: 23.8 s
Wall time: 23.7 s


%time _ = f_graph(x, y)

CPU times: user 1.56 ms, sys: 0 ns, total: 1.56 ms
Wall time: 937 µs


@tf.function
def f(x):
    """Contrast a trace-time side effect with a run-time side effect.

    The Python ``print`` fires only while the function is being traced (once
    per distinct Python argument value in the calls below), whereas
    ``tf.print`` is part of the traced function and fires on every call.
    """
    print("Traced with", x)
    tf.print("Executed with", x)


f(1)
f(1)
f(2)

Traced with 1
Executed with 1
Executed with 1
Traced with 2
Executed with 2


g = 0

@tf.function
def mutate_globals(x):
    """Return ``x + g``, where ``g`` is looked up in module scope.

    NOTE: ``g`` is a plain Python value, so it is read when the function is
    traced rather than on each call. Rebinding ``g`` afterwards is only
    picked up once the function is retraced, e.g. by a call whose argument
    has a different dtype or shape.
    """
    return x + g

# tf.function captures the value of the global during the first run
print("First call: ", mutate_globals(tf.constant(1)))
g = 10  # Update the global

# Subsequent runs may silently use the cached value of the globals
print("Second call: ", mutate_globals(tf.constant(2)))

# tf.function re-runs the Python function when the type or shape of the argument changes
# This will end up reading the latest value of the global
print("Third call, different type: ", mutate_globals(tf.constant([4.])))

First call:  tf.Tensor(1, shape=(), dtype=int32)
Second call:  tf.Tensor(2, shape=(), dtype=int32)
Third call, different type:  tf.Tensor([14.], shape=(1,), dtype=float32)


def log1pexp(x):
    """Compute log(1 + exp(x)) directly from its definition.

    NOTE: this straightforward form is numerically fragile for large `x`;
    its automatically derived gradient evaluates to NaN at x = 100.
    """
    exp_x = tf.exp(x)
    return tf.math.log(1 + exp_x)


x = tf.constant(100.)
with tf.GradientTape() as g:
    g.watch(x)
    y = log1pexp(x)
dy = g.gradient(y, x) # Will be evaluated as NaN
print("dy/dx =", dy.numpy())

dy/dx = nan


@tf.custom_gradient
def log1pexp(x):
    """Compute log(1 + exp(x)) with a hand-written gradient.

    Returns the forward value together with the gradient function that
    `tf.custom_gradient` uses in place of automatic differentiation.
    """
    exp_x = tf.exp(x)
    value = tf.math.log(1 + exp_x)

    def grad(upstream):
        # d/dx log(1 + e^x), written as 1 - 1 / (1 + e^x). This form must be
        # kept: it stays finite when e^x overflows, unlike e^x / (1 + e^x).
        return upstream * (1 - 1 / (1 + exp_x))

    return value, grad


x = tf.constant(100.)
with tf.GradientTape() as g:
    g.watch(x)
    y = log1pexp(x)
dy = g.gradient(y, x) # Will be evaluated as 1.0
print("dy/dx =", dy.numpy())

dy/dx = 1.0

TensorFlow 101¶

Why TensorFlow?¶

Environment setup¶

Software requirements¶

Install CUDA with apt¶

Ubuntu 18.04 (CUDA 10.1)¶

Ubuntu 16.04 (CUDA 10.1)¶

Install CUDA with Anaconda¶

Install TensorFlow 2¶

Google Colab¶

TensorFlow 2 quickstart¶

Limit GPU memory growth¶

Load dataset via tf.keras.datasets¶

Build model via Sequential API¶

When to use?¶

Build model via Functional API¶

When to use?¶

Build model via Model Subclassing¶

When to use?¶

Sequential API, Functional API, and Model Subclassing¶

Better performance with tf.function¶

Debugging¶

Python side effects¶

Customize gradient flow by tf.custom_gradient¶

Reference¶