Implementing our network to classify digits

We will use the MNIST dataset.


import random
import numpy as np

class Network(object):

    def __init__(self, sizes):
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
        self.weights = [np.random.randn(y, x) 
                        for x, y in zip(sizes[:-1], sizes[1:])]
    

We use NumPy's np.random.randn function to initialize the parameters with samples from a Gaussian distribution with mean 0 and standard deviation 1.

In this code, the sizes list contains the number of neurons in each respective layer.

self.biases is a list of NumPy column vectors, one per layer, excluding the input layer (which has no biases).

self.weights is a list in which each element is the weight matrix for one layer, again excluding the input layer. The matrix for the nth layer has as many rows as that layer has neurons, and each row is a weight vector whose length equals the size of the previous layer.
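As a quick sanity check, here is a hedged sketch (with hypothetical sizes) of the shapes the constructor produces for a [2, 3, 1] network:


net = Network([2, 3, 1])
print([b.shape for b in net.biases])   # [(3, 1), (1, 1)]
print([w.shape for w in net.weights])  # [(3, 2), (1, 3)]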

The following helper functions live at module level (note that they take no self argument); the methods after them belong to the Network class.


def sigmoid(z):
    '''The sigmoid function.'''
    return 1.0/(1.0+np.exp(-z))

def sigmoid_prime(z):
    '''Derivative of the sigmoid function.'''
    return sigmoid(z)*(1-sigmoid(z))
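The derivative formula follows directly from the definition; in LaTeX:

\sigma(z) = \frac{1}{1 + e^{-z}}
\quad\Rightarrow\quad
\sigma'(z) = \frac{e^{-z}}{(1 + e^{-z})^{2}} = \sigma(z)\,(1 - \sigma(z))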

Note that when the input z is a NumPy vector or array, NumPy automatically applies the sigmoid function elementwise, that is, in vectorized form.
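A hedged example of this behaviour, with a hypothetical input:


z = np.array([[-1.0], [0.0], [1.0]])
print(sigmoid(z))  # approximately [[0.269], [0.5], [0.731]]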


def feedforward(self, a):
    """Return the output of the network when ``a`` is the input."""
    for b, w in zip(self.biases, self.weights):
        a = sigmoid(np.dot(w, a)+b)
    return a
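Each pass through the loop computes the activations of the next layer; in LaTeX:

a' = \sigma(w a + b)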

The stochastic gradient descent method


def SGD(self, training_data, epochs, mini_batch_size, eta,
         test_data=None):

    if test_data: 
        n_test = len(test_data)

    n = len(training_data)

    for j in range(epochs):
        random.shuffle(training_data)
        mini_batches = [
            training_data[k:k+mini_batch_size]
            for k in range(0, n, mini_batch_size)
            ]
        for mini_batch in mini_batches:
            self.update_mini_batch(mini_batch, eta)
        if test_data:
            print "Epoch {0}: {1} / {2}".format(
                j, self.evaluate(test_data), n_test)
        else:
            print "Epoch {0} complete".format(j)

def update_mini_batch(self, mini_batch, eta):
    """Update the network's weights and biases by applying gradient
    descent via backpropagation to a single mini-batch."""

    nabla_b = [np.zeros(b.shape) for b in self.biases]
    nabla_w = [np.zeros(w.shape) for w in self.weights]

    for x, y in mini_batch:
        delta_nabla_b, delta_nabla_w = self.backprop(x, y)
        nabla_b = [nb+dnb for 
                  nb, dnb in zip(nabla_b, delta_nabla_b)]
        nabla_w = [nw+dnw for 
                  nw, dnw in zip(nabla_w, delta_nabla_w)]

    self.weights = [w-(eta/len(mini_batch))*nw 
                        for w, nw in zip(self.weights, nabla_w)]
    self.biases = [b-(eta/len(mini_batch))*nb 
                       for b, nb in zip(self.biases, nabla_b)]

The backpropagation algorithm

It returns a tuple (nabla_b, nabla_w) representing the gradient of the cost function C_x. nabla_b and nabla_w are layer-by-layer lists of NumPy arrays, with the same structure as self.biases and self.weights.
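For reference, these are the standard backpropagation equations the method implements, where \delta^l is the error in layer l; in LaTeX:

\delta^L = \nabla_a C \odot \sigma'(z^L), \qquad
\delta^l = \left( (w^{l+1})^{T} \delta^{l+1} \right) \odot \sigma'(z^l),

\frac{\partial C}{\partial b^l_j} = \delta^l_j, \qquad
\frac{\partial C}{\partial w^l_{jk}} = a^{l-1}_k \, \delta^l_j.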


def backprop(self, x, y):

    nabla_b = [np.zeros(b.shape) for b in self.biases]
    nabla_w = [np.zeros(w.shape) for w in self.weights]
    # Forward pass
    activation = x
    # List to store all the activations, layer by layer
    activations = [x]
    # List to store all the z vectors, layer by layer
    zs = []
    for b, w in zip(self.biases, self.weights):
        z = np.dot(w, activation)+b
        zs.append(z)
        activation = sigmoid(z)
        activations.append(activation)
    # Backward pass
    delta = self.cost_derivative(activations[-1], y) * sigmoid_prime(zs[-1])
    nabla_b[-1] = delta
    nabla_w[-1] = np.dot(delta, activations[-2].transpose())
        
    # Here l counts layers from the back: l = 2 is the second-to-last layer.
    for l in range(2, self.num_layers):
        z = zs[-l]
        sp = sigmoid_prime(z)
        delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
        nabla_b[-l] = delta
        nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
    return (nabla_b, nabla_w)


def cost_derivative(self, output_activations, y):
    r"""Return the vector of partial derivatives \partial C_x /
    \partial a for the output activations."""
    return (output_activations-y)
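This follows from the quadratic cost used by this first network; in LaTeX:

C_x = \tfrac{1}{2} \lVert a - y \rVert^{2}
\quad\Rightarrow\quad
\frac{\partial C_x}{\partial a} = a - y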

Example usage


import network

# Hedged sketch: load_data() is assumed to return the MNIST training,
# validation and test sets in the (x, y) format the network expects.
training_data, validation_data, test_data = load_data()
net = network.Network([784, 30, 10])
net.SGD(training_data, 30, 10, 3.0, test_data=test_data)

Here 30 is the number of epochs, 10 the mini-batch size, and 3.0 the learning rate eta.

An improved network: cross-entropy cost, smarter weight initialization, and L2 regularization


class Network(object):

    def __init__(self, sizes, cost=CrossEntropyCost):
        self.num_layers = len(sizes)
        self.sizes = sizes
        self.default_weight_initializer()
        self.cost = cost


def default_weight_initializer(self):
    """Initialize each weight with a Gaussian of mean 0 and standard
    deviation 1/sqrt(x), where x is the number of inputs to the neuron;
    biases keep mean 0 and standard deviation 1."""
    self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]]
    self.weights = [np.random.randn(y, x)/np.sqrt(x)
                  for x, y in zip(self.sizes[:-1], self.sizes[1:])]

def large_weight_initializer(self):
    """Initialize weights and biases with plain unit Gaussians, as in the
    first version of the network; kept only for comparison."""
    self.biases = [np.random.randn(y, 1) for y in self.sizes[1:]]
    self.weights = [np.random.randn(y, x)
                  for x, y in zip(self.sizes[:-1], self.sizes[1:])]
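A hedged sanity check (hypothetical sizes; assumes the improved class and its cost classes are already defined) showing why the scaling matters: with many inputs per neuron, the scaled initializer keeps the weights, and hence the weighted input z, at unit scale:


net = Network([1000, 1])
print(np.std(net.weights[0]))  # roughly 1/sqrt(1000), i.e. about 0.032
net.large_weight_initializer()
print(np.std(net.weights[0]))  # roughly 1.0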

class CrossEntropyCost(object):

    @staticmethod
    def fn(a, y):
        # np.nan_to_num guards against nan from 0*log(0) when a hits 0 or 1.
        return np.sum(np.nan_to_num(-y*np.log(a)-(1-y)*np.log(1-a)))

    @staticmethod
    def delta(z, a, y):
        return (a-y)

class QuadraticCost(object):

    @staticmethod
    def fn(a, y):
        return 0.5*np.linalg.norm(a-y)**2

    @staticmethod
    def delta(z, a, y):
        return (a-y) * sigmoid_prime(z)
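For reference, the cross-entropy cost and the output-layer deltas of the two costs, in LaTeX:

C = -\frac{1}{n} \sum_x \left[ y \ln a + (1 - y) \ln(1 - a) \right]

\delta_{\text{cross-entropy}} = a - y, \qquad
\delta_{\text{quadratic}} = (a - y)\,\sigma'(z)

The \sigma'(z) factor cancels for the cross-entropy cost, so output neurons keep learning even when they saturate; with the quadratic cost, learning slows down wherever \sigma'(z) is small.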

def SGD(self, training_data, epochs, mini_batch_size, eta,
            lmbda = 0.0,
            evaluation_data=None,
            monitor_evaluation_cost=False,
            monitor_evaluation_accuracy=False,
            monitor_training_cost=False,
            monitor_training_accuracy=False):

    if evaluation_data: n_data = len(evaluation_data)
    n = len(training_data)
    evaluation_cost, evaluation_accuracy = [], []
    training_cost, training_accuracy = [], []
    for j in range(epochs):
        random.shuffle(training_data)
        mini_batches = [
                training_data[k:k+mini_batch_size]
                for k in range(0, n, mini_batch_size)]
        for mini_batch in mini_batches:
            self.update_mini_batch(
                    mini_batch, eta, lmbda, len(training_data))
        print("Epoch %s training complete" % j)
        if monitor_training_cost:
            cost = self.total_cost(training_data, lmbda)
            training_cost.append(cost)
            print("Cost on training data: {}".format(cost))
        if monitor_training_accuracy:
            accuracy = self.accuracy(training_data, convert=True)
            training_accuracy.append(accuracy)
            print("Accuracy on training data: {} / {}".format(
                    accuracy, n))
        if monitor_evaluation_cost:
            cost = self.total_cost(evaluation_data, lmbda, convert=True)
            evaluation_cost.append(cost)
            print("Cost on evaluation data: {}".format(cost))
        if monitor_evaluation_accuracy:
            accuracy = self.accuracy(evaluation_data)
            evaluation_accuracy.append(accuracy)
            print("Accuracy on evaluation data: {} / {}".format(
                    accuracy, n_data))
    return evaluation_cost, evaluation_accuracy, \
        training_cost, training_accuracy


def update_mini_batch(self, mini_batch, eta, lmbda, n):
    """Update weights and biases from a single mini-batch. ``lmbda`` is
    the regularization parameter and ``n`` the size of the training set."""
    nabla_b = [np.zeros(b.shape) for b in self.biases]
    nabla_w = [np.zeros(w.shape) for w in self.weights]
    for x, y in mini_batch:
        delta_nabla_b, delta_nabla_w = self.backprop(x, y)
        nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
        nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
    self.weights = [(1-eta*(lmbda/n))*w-(eta/len(mini_batch))*nw
                        for w, nw in zip(self.weights, nabla_w)]
    self.biases = [b-(eta/len(mini_batch))*nb
                       for b, nb in zip(self.biases, nabla_b)]
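The weight update implemented here is the L2-regularized gradient descent rule, in LaTeX (m is the mini-batch size, n the training set size):

w \rightarrow \left(1 - \frac{\eta \lambda}{n}\right) w - \frac{\eta}{m} \sum_x \frac{\partial C_x}{\partial w}

The (1 - \eta\lambda/n) factor rescales, or decays, the weights; note that the biases are not regularized.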

def backprop(self, x, y):

    nabla_b = [np.zeros(b.shape) for b in self.biases]
    nabla_w = [np.zeros(w.shape) for w in self.weights]
    
    # Forward pass
    activation = x
    activations = [x]  # stores all the activations, layer by layer
    zs = []  # stores all the z vectors, layer by layer
    for b, w in zip(self.biases, self.weights):
        z = np.dot(w, activation)+b
        zs.append(z)
        activation = sigmoid(z)
        activations.append(activation)
       
    # Backward pass; the output-layer delta is delegated to the cost class
    delta = (self.cost).delta(zs[-1], activations[-1], y)
    nabla_b[-1] = delta
    nabla_w[-1] = np.dot(delta, activations[-2].transpose())

    for l in range(2, self.num_layers):
        z = zs[-l]
        sp = sigmoid_prime(z)
        delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
        nabla_b[-l] = delta
        nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
    return (nabla_b, nabla_w)

def accuracy(self, data, convert=False):
    """Return the number of inputs in ``data`` for which the network's
    prediction is correct. Set ``convert`` to True when the labels are
    one-hot vectors (training data) and False when they are plain digits
    (validation or test data)."""
    if convert:
        results = [(np.argmax(self.feedforward(x)), np.argmax(y))
                    for (x, y) in data]
    else:
        results = [(np.argmax(self.feedforward(x)), y)
                    for (x, y) in data]
    return sum(int(x == y) for (x, y) in results)

def total_cost(self, data, lmbda, convert=False):
    """Return the total L2-regularized cost for ``data``. Here ``convert``
    should be True when the labels are plain digits and must be converted
    to one-hot vectors, i.e. the opposite convention to ``accuracy``."""
    cost = 0.0
    for x, y in data:
        a = self.feedforward(x)
        if convert: y = vectorized_result(y)
        cost += self.cost.fn(a, y)/len(data)
    cost += 0.5*(lmbda/len(data))*sum(
        np.linalg.norm(w)**2 for w in self.weights)
    return cost
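This computes the regularized cost, in LaTeX:

C = \frac{1}{n} \sum_x C_x + \frac{\lambda}{2n} \sum_w w^{2}

where the second sum runs over all the weights in the network.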

import json

def save(self, filename):
    """Save the neural network to the file ``filename``."""
    data = {"sizes": self.sizes,
            "weights": [w.tolist() for w in self.weights],
            "biases": [b.tolist() for b in self.biases],
            "cost": str(self.cost.__name__)}
    with open(filename, "w") as f:
        json.dump(data, f)

import sys

def load(filename):
    """Load a neural network from the file ``filename``.
    Returns a Network instance."""
    with open(filename, "r") as f:
        data = json.load(f)
    cost = getattr(sys.modules[__name__], data["cost"])
    net = Network(data["sizes"], cost=cost)
    net.weights = [np.array(w) for w in data["weights"]]
    net.biases = [np.array(b) for b in data["biases"]]
    return net
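A hedged usage sketch (hypothetical filename) round-tripping a network through JSON; save is a method of Network, while load is a module-level function:


net = Network([784, 30, 10])
net.save("net.json")
net2 = load("net.json")
print(net2.sizes)  # [784, 30, 10]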

def vectorized_result(j):
    '''Return a 10-dimensional unit vector with a 1.0 in the j-th
    position and zeros elsewhere. This converts a digit (0...9) into
    the corresponding desired output of the network.'''
    e = np.zeros((10, 1))
    e[j] = 1.0
    return e

Reference