This article trains a neural network on a synthetic dataset, covering data preprocessing, batch normalization (BN) layers, the network model, gradient backpropagation, gradient checking, monitoring the training process, and random hyperparameter search, so that the reader comes away with a complete machine-learning workflow.
1. Generating the Data
Generate a linearly inseparable dataset: an oscillating signal riding on a trend that grows along the x axis, with each class phase-shifted.
import numpy as np
import matplotlib.pyplot as plt

# plot the data, colored by class label
def show_data(X, labels):
    plt.scatter(X[:, 0], X[:, 1], c=labels, s=10, cmap=plt.cm.Spectral)
    plt.show()

# purely random data (used later for gradient checking)
def gen_random_data(dim, N_class, num_samp_per_class):
    num_examples = num_samp_per_class * N_class
    X = np.random.randn(num_examples, dim)  # data matrix (each row = single example)
    labels = np.random.randint(N_class, size=num_examples)  # class labels
    show_data(X, labels)
    return (X, labels)

dim = 2      # dimensionality
N_class = 4  # number of classes
layer_param = [dim, 10, 20, N_class]
(X, labels) = gen_random_data(dim, N_class, num_samp_per_class=20)
# generate the toy dataset
def gen_toy_data(dim, N_class, num_samp_per_class):
    num_examples = num_samp_per_class * N_class
    X = np.zeros((num_examples, dim))  # data matrix
    labels = np.zeros(num_examples, dtype='uint8')  # class labels
    for j in range(N_class):
        ix = range(num_samp_per_class * j, num_samp_per_class * (j + 1))
        x = np.linspace(-np.pi, np.pi, num_samp_per_class) + 5  # x axis
        y = np.sin(x + j * np.pi / (0.5 * N_class))
        y += 0.2 * np.sin(10 * x + j * np.pi / (0.5 * N_class))
        y += 0.25 * x + 10
        y += np.random.randn(num_samp_per_class) * 0.1  # noise
        X[ix] = np.c_[x, y]
        labels[ix] = j
    show_data(X, labels)
    return (X, labels)

2. Data Preprocessing
Zero-centering and normalization:
def normalize(X):  # (x - mean) / std
    mean = np.mean(X, axis=0)
    X_norm = X - mean
    std = np.std(X_norm, axis=0)
    X_norm /= std + 1e-5
    return (X_norm, mean, std)
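A minimal usage sketch (an assumption about intended use, mirroring what data_preprocess does for PCA below): the mean and std returned by normalize should be reused to transform validation and test data, so that no statistics leak from held-out data.

# hypothetical usage: transform held-out data with training-set statistics
(X_train_norm, mean, std) = normalize(X_train)
X_val_norm = (X_val - mean) / (std + 1e-5)
X_test_norm = (X_test - mean) / (std + 1e-5)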
PCA whitening:
def PCA_white(X):
    mean = np.mean(X, axis=0)
    X_norm = X - mean
    cov = np.dot(X_norm.T, X_norm) / X_norm.shape[0]
    U, S, V = np.linalg.svd(cov)
    X_norm = np.dot(X_norm, U)    # decorrelate
    X_norm /= np.sqrt(S + 1e-5)   # whiten
    return (X_norm, mean, U, S)
The dataset is randomly split into training, validation, and test sets in a 2:1:1 ratio:
def split_data(X, labels):
    split1 = 2
    split2 = 4
    num_examples = X.shape[0]
    shuffle_no = list(range(num_examples))
    np.random.shuffle(shuffle_no)
    X_train = X[shuffle_no[:num_examples//split1]]
    labels_train = labels[shuffle_no[:num_examples//split1]]
    X_val = X[shuffle_no[num_examples//split1:num_examples//split1 + num_examples//split2]]
    labels_val = labels[shuffle_no[num_examples//split1:num_examples//split1 + num_examples//split2]]
    X_test = X[shuffle_no[-num_examples//split2:]]
    labels_test = labels[shuffle_no[-num_examples//split2:]]
    return (X_train, labels_train, X_val, labels_val, X_test, labels_test)
Apply the preprocessing, using statistics computed on the training set only:
def data_preprocess(X_train, X_val, X_test):
    (X_train_pca, mean, U, S) = PCA_white(X_train)
    X_val_pca = np.dot(X_val - mean, U)
    X_val_pca /= np.sqrt(S + 1e-5)
    X_test_pca = np.dot(X_test - mean, U)
    X_test_pca /= np.sqrt(S + 1e-5)
    return (X_train_pca, X_val_pca, X_test_pca)
3. Network Model
The architectural hyperparameters are the network depth (the number of hidden layers) and the number of neurons in each layer. A list can store the width of every layer, including the input and output layers.
Initialization:
# layer_param = [dim, 100, 100, N_class]
def initialize_parameters(layer_param):
    weights = []
    biases = []
    vweights = []  # momentum velocities
    vbiases = []
    for i in range(len(layer_param) - 1):
        in_depth = layer_param[i]
        out_depth = layer_param[i+1]
        std = np.sqrt(2/in_depth)*0.5  # scaled He initialization
        weights.append(std*np.random.randn(in_depth, out_depth))
        biases.append(np.zeros((1, out_depth)))
        vweights.append(np.zeros((in_depth, out_depth)))
        vbiases.append(np.zeros((1, out_depth)))
    return (weights, biases, vweights, vbiases)
Forward pass:
def forward(X, layer_param, weights, biases):
    hiddens = []
    hiddens.append(X)
    for i in range(len(layer_param)-2):
        hiddens.append(np.maximum(0, np.dot(hiddens[i], weights[i]) + biases[i]))
    # the last layer has no nonlinear activation
    scores = np.dot(hiddens[-1], weights[-1]) + biases[-1]
    return (hiddens, scores)
Compute the softmax data loss:
def data_loss_softmax(scores, labels):
    num_examples = scores.shape[0]
    exp_scores = np.exp(scores)
    exp_scores_sum = np.sum(exp_scores, axis=1)
    correct_probs = exp_scores[range(num_examples), labels] / exp_scores_sum
    correct_logprobs = -np.log(correct_probs)
    data_loss = np.sum(correct_logprobs) / num_examples
    return data_loss
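As written, np.exp can overflow when scores grow large. A minimal numerically stable variant (a sketch, not part of the original code) subtracts the per-row maximum before exponentiating, which leaves the softmax unchanged; the same trick applies to dscores_softmax below.

def data_loss_softmax_stable(scores, labels):
    num_examples = scores.shape[0]
    # subtracting the row-wise max leaves softmax unchanged but avoids overflow
    shifted = scores - np.max(scores, axis=1, keepdims=True)
    log_probs = shifted - np.log(np.sum(np.exp(shifted), axis=1, keepdims=True))
    return -np.mean(log_probs[range(num_examples), labels])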
Compute the L2 regularization loss:
def reg_L2_loss(weights, reg):
    reg_loss = 0
    for weight in weights:
        reg_loss += 0.5*reg*np.sum(weight*weight)
    return reg_loss
Compute the gradient of the loss with respect to the score matrix. For softmax with cross-entropy this is simply the predicted probabilities, minus 1 at the correct class, averaged over the batch:
def dscores_softmax(scores, labels):
    num_examples = scores.shape[0]
    exp_scores = np.exp(scores)
    probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
    dscores = probs
    dscores[range(num_examples), labels] -= 1
    dscores /= num_examples
    return dscores
Accuracy prediction is almost identical to the forward pass, except that the hidden activations need not be stored:

def predict(X, labels, layer_param, weights, biases):
    hidden = X
    for i in range(len(layer_param)-2):
        hidden = np.maximum(0, np.dot(hidden, weights[i]) + biases[i])
    scores = np.dot(hidden, weights[-1]) + biases[-1]
    predicted_class = np.argmax(scores, axis=1)
    right_class = predicted_class == labels
    return np.mean(right_class)
Gradient backpropagation:
def gradient_backprop(dscores, hiddens, weights, biases, reg):
    dweights = []  # note: appended from the last layer back to the first
    dbiases = []
    dhidden = dscores
    for i in range(len(hiddens)-1, -1, -1):
        dweights.append(np.dot(hiddens[i].T, dhidden) + reg*weights[i])
        dbiases.append(np.sum(dhidden, axis=0, keepdims=True))
        dhidden = np.dot(dhidden, weights[i].T)
        dhidden[hiddens[i] <= 0] = 0  # ReLU: gradient flows only where the activation was positive
    return (dweights, dbiases)
4. Gradient Check
Check the gradient of a randomly chosen weight and bias in each layer:
def check_gradient(X, labels, layer_param, check_weight_or_bias):
    # regenerate random data; the X and labels arguments are overwritten
    (X, labels) = gen_random_data(dim, N_class, num_samp_per_class=200)
    (weights, biases, vweights, vbiases) = initialize_parameters(layer_param)
    reg = 1e-9
    step = 1e-5
    for layer in range(len(weights)):
        if check_weight_or_bias:
            row = np.random.randint(weights[layer].shape[0])
            col = np.random.randint(weights[layer].shape[1])
            param = weights[layer][row][col]
        else:
            row = np.random.randint(weights[layer].shape[1])
            param = biases[layer][0][row]
        # forward pass
        (hiddens, scores) = forward(X, layer_param, weights, biases)
        dscores = dscores_softmax(scores, labels)
        # backward pass
        (dweights, dbiases) = gradient_backprop(dscores, hiddens, weights, biases, reg)
        if check_weight_or_bias:
            danalytic = dweights[-1-layer][row][col]
        else:
            danalytic = dbiases[-1-layer][0][row]

        if check_weight_or_bias:
            weights[layer][row][col] = param - step
        else:
            biases[layer][0][row] = param - step
        (hiddens, scores) = forward(X, layer_param, weights, biases)
        data_loss1 = data_loss_softmax(scores, labels)
        reg_loss1 = reg_L2_loss(weights, reg)
        loss1 = data_loss1 + reg_loss1

        if check_weight_or_bias:
            weights[layer][row][col] = param + step
        else:
            biases[layer][0][row] = param + step
        (hiddens, scores) = forward(X, layer_param, weights, biases)
        data_loss2 = data_loss_softmax(scores, labels)
        reg_loss2 = reg_L2_loss(weights, reg)
        loss2 = data_loss2 + reg_loss2

        # restore the perturbed parameter before checking the next layer
        if check_weight_or_bias:
            weights[layer][row][col] = param
        else:
            biases[layer][0][row] = param

        # centered difference; compare magnitudes with abs in the denominator
        dnumeric = (loss2 - loss1) / (2*step)
        print(layer, data_loss1, data_loss2)
        error_relative = np.abs(danalytic - dnumeric) / np.maximum(np.abs(danalytic), np.abs(dnumeric))
        print(danalytic, dnumeric, error_relative)
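A minimal invocation sketch (the None placeholders reflect the fact that check_gradient regenerates its own data internally). With centered differences, relative errors on the order of 1e-8 to 1e-6 usually indicate a correct implementation:

dim = 2
N_class = 4
layer_param = [dim, 10, 20, N_class]
check_gradient(None, None, layer_param, check_weight_or_bias=True)   # check weights
check_gradient(None, None, layer_param, check_weight_or_bias=False)  # check biases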
5. Parameter Optimization
Nesterov momentum is used for the parameter update; the function also returns the ratio of the update magnitude to the parameter magnitude. The velocity update is v_new = mu*v - lr*g, and the parameter step v_new + mu*(v_new - v_old) is the standard rewriting of Nesterov's lookahead in terms of the current parameters. Note that dparams is indexed as dparams[-1-i] because gradient_backprop returns gradients ordered from the last layer to the first.
def nesterov_momentumSGD(vparams, params, dparams, lr, mu):
    update_ratio = []
    for i in range(len(params)):
        pre_vparam = vparams[i]
        vparams[i] = mu*vparams[i] - lr*dparams[-1-i]
        update_param = vparams[i] + mu*(vparams[i] - pre_vparam)
        params[i] += update_param
        update_ratio.append(np.sum(np.abs(update_param)) / np.sum(np.abs(params[i])))
    return update_ratio
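A quick sanity check of the optimizer (a sketch, not from the original text): minimize the one-parameter quadratic f(w) = w^2, whose gradient is 2w. The parameter should decay toward 0 and the update ratio should shrink.

w = [np.array([[1.0]])]   # single "layer" with one parameter
v = [np.zeros((1, 1))]    # its velocity
for step in range(100):
    grads = [2 * w[0]]    # gradient of w^2; one entry since there is one layer
    ratio = nesterov_momentumSGD(v, w, grads, lr=0.05, mu=0.9)
print(w[0])  # close to 0 after 100 steps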
6. Training the Network
Training proceeds as follows: initialize the parameters; run the forward pass; record training- and validation-set accuracy; compute the data loss and regularization loss; backpropagate the gradients; apply exponential learning-rate decay; and visualize the results.
def train_net(X_train, labels_train, layer_param, lr, lr_decay,
              reg, mu, max_epoch, X_val, labels_val):
    lr0 = lr
    (weights, biases, vweights, vbiases) = initialize_parameters(layer_param)
    epoch = 0
    data_losses = []
    reg_losses = []
    val_accuracy = []
    train_accuracy = []
    weights_update_ratio = []
    biases_update_ratio = []
    while epoch < max_epoch:
        (hiddens, scores) = forward(X_train, layer_param, weights, biases)
        val_accuracy.append(predict(X_val, labels_val, layer_param, weights, biases))
        train_accuracy.append(predict(X_train, labels_train, layer_param, weights, biases))
        data_loss = data_loss_softmax(scores, labels_train)
        reg_loss = reg_L2_loss(weights, reg)
        dscores = dscores_softmax(scores, labels_train)
        (dweights, dbiases) = gradient_backprop(dscores, hiddens, weights, biases, reg)
        weights_update_ratio.append(nesterov_momentumSGD(vweights, weights, dweights, lr, mu))
        biases_update_ratio.append(nesterov_momentumSGD(vbiases, biases, dbiases, lr, mu))
        data_losses.append(data_loss)
        reg_losses.append(reg_loss)
        epoch += 1
        lr *= lr_decay  # exponential learning-rate decay

    # visualize the data loss and the training/validation accuracy
    plt.close()
    fig = plt.figure('loss')
    ax = fig.add_subplot(2, 1, 1)
    ax.grid(True)
    ax2 = fig.add_subplot(2, 1, 2)
    ax2.grid(True)
    plt.xlabel('log10(lr)=' + str(round(np.log10(lr0), 2)) + '  ' +
               'log10(reg)=' + str(round(np.log10(reg), 2)), fontsize=14)
    plt.ylabel('accuracy    log10(data loss)', fontsize=14)
    ax.scatter(np.arange(len(data_losses)), np.log10(data_losses), c='b', marker='.')
    ax2.scatter(np.arange(len(val_accuracy)), val_accuracy, c='r', marker='*')
    ax2.scatter(np.arange(len(train_accuracy)), train_accuracy, c='g', marker='.')
    plt.show()

    # plot the per-layer update ratios on a log scale; a healthy value is around 1e-3
    for layer in range(len(weights)):
        wur = [weights_update_ratio[i][layer] for i in range(len(weights_update_ratio))]
        bur = [biases_update_ratio[i][layer] for i in range(len(biases_update_ratio))]
        plt.close()
        fig = plt.figure('update ratio')
        ax = fig.add_subplot(2, 1, 1)
        ax.grid(True)
        ax2 = fig.add_subplot(2, 1, 2)
        ax2.grid(True)
        plt.xlabel('log10(lr)=' + str(round(np.log10(lr0), 2)) + '  ' +
                   'log10(reg)=' + str(round(np.log10(reg), 2)), fontsize=14)
        ax.scatter(np.arange(len(wur)), np.log10(wur), c='b', marker='.')
        ax2.scatter(np.arange(len(bur)), np.log10(bur), c='r', marker='*')
        plt.show()
    return (data_losses, reg_losses, weights, biases, val_accuracy)
7. Overfitting a Tiny Dataset
def overfit_tinydata(lr=10**(-0.0), lr_decay=1, mu=0.9, reg=0, max_epoch=100):
    (X, labels) = gen_toy_data(dim, N_class, num_samp_per_class=2)
    X, _, _, _ = PCA_white(X)
    layer_param = [dim, 100, 100, N_class]
    (data_losses, reg_losses, weights, biases, accuracy) = train_net(
        X, labels, layer_param, lr, lr_decay, reg, mu, max_epoch, X, labels)
    return (data_losses, reg_losses, accuracy)
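Before any serious tuning, it is worth confirming that the network can drive the loss to near zero on a handful of examples. A usage sketch (the expected outcome is a rule of thumb, not a guarantee): with only 8 examples and no regularization, training accuracy should reach 1.0 well within the 100 epochs.

(data_losses, reg_losses, accuracy) = overfit_tinydata()
print(data_losses[-1])  # expected to be close to 0
print(accuracy[-1])     # expected to reach 1.0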

8. Hyperparameter Random Search
def hyperparam_random_search(X_train, labels_train, X_val, labels_val, layer_param,
                             num_try=10, lr=[-1, -5], lr_decay=1, mu=0.9,
                             reg=[-2.0, -5.0], max_epoch=500):
    # sample the learning rate and regularization strength uniformly in log10 space
    minlr = min(lr)
    maxlr = max(lr)
    randn = np.random.rand(num_try*2)
    lr_array = 10**(minlr + (maxlr-minlr)*randn[0:num_try])
    minreg = min(reg)
    maxreg = max(reg)
    reg_array = 10**(minreg + (maxreg-minreg)*randn[num_try:2*num_try])
    lr_regs = zip(lr_array, reg_array)
    for lr_reg in lr_regs:
        (data_loss, reg_loss, weights, biases, val_accuracy) = train_net(
            X_train, labels_train, layer_param, lr_reg[0], lr_decay,
            lr_reg[1], mu, max_epoch, X_val, labels_val)
    return (weights, biases)
dim = 2      # dimensionality
N_class = 4  # number of classes
layer_param = [dim, 100, 100, N_class]
(X, labels) = gen_toy_data(dim, N_class, num_samp_per_class=200)
(X_train, labels_train, X_val, labels_val, X_test, labels_test) = split_data(X, labels)
(X_train_pca, X_val_pca, X_test_pca) = data_preprocess(X_train, X_val, X_test)
(weights, biases) = hyperparam_random_search(
    X_train_pca, labels_train, X_val_pca, labels_val, layer_param,
    num_try=2, lr=[-0.1, -0.2], lr_decay=1, mu=0.9, reg=[-2, -5], max_epoch=2000)
With lr=[-0.1, -0.2] above (learning rates of roughly 10^-0.1 to 10^-0.2), the data loss drops quickly but fluctuates heavily, and the parameter update ratios are too high.


After changing to lr=[-1.5, -2.5], the data loss is essentially free of oscillation, the training- and validation-set accuracies saturate, and overfitting is not pronounced.


9. Evaluating the Model
accuracy = predict(X_test_pca, labels_test, layer_param, weights, biases)
print(accuracy)  # 0.785
10. Adding BN Layers
Adding BN layers does not change the overall flow of the program. Note, however, that the bias parameters are no longer needed; instead, each layer gains scale (gamma) and shift (beta) parameters, and some intermediate BN results, such as the per-batch mean and variance, must be kept for the backward pass. Because training here uses the full dataset in every step, no moving averages of the mean and variance are required.
BN_EPSILON = 1e-5

def initialize_parameters_BN(layer_param):
    weights = []
    gammas = []
    betas = []
    vweights = []
    vgammas = []
    vbetas = []
    for i in range(len(layer_param) - 1):
        in_depth = layer_param[i]
        out_depth = layer_param[i+1]
        std = np.sqrt(2/in_depth)  # He initialization
        weights.append(std * np.random.randn(in_depth, out_depth))
        gammas.append(np.ones((1, out_depth)))
        betas.append(np.zeros((1, out_depth)))
        vweights.append(np.zeros((in_depth, out_depth)))
        vgammas.append(np.zeros((1, out_depth)))
        vbetas.append(np.zeros((1, out_depth)))
    params = (weights, gammas, betas)
    vparams = (vweights, vgammas, vbetas)
    return (params, vparams)

def BN_forward(X, gamma, beta):
    mu = np.mean(X, axis=0)
    var = np.var(X, axis=0)
    std = np.sqrt(var + BN_EPSILON)
    X_hat = (X - mu)/std
    Y = gamma*X_hat + beta
    bn_cache = (mu, var, std)
    return (Y, bn_cache, X_hat)

def BN_backprop(dY, X, X_hat, bn_cache, gamma, beta):
    (mu, var, std) = bn_cache
    batch = X.shape[0]
    dX_hat = gamma*dY
    dbeta = np.sum(dY, axis=0, keepdims=True)
    dgamma = np.sum(X_hat*dY, axis=0, keepdims=True)
    dX = dX_hat/std
    dmu = -np.sum(dX_hat, axis=0, keepdims=True)/std
    dstd = -1/(std*std) * np.sum((X - mu)*dX_hat, axis=0)
    dvar = 0.5*dstd/std  # d(std)/d(var) = 1/(2*std), with std = sqrt(var + eps)
    dX += 2*(X - mu)/batch * dvar
    dX += dmu/batch
    return (dX, dgamma, dbeta)

def forward_BN(X, layer_param, params):
    hiddens = []
    bn_ins = []
    bn_caches = []
    bn_hats = []
    (weights, gammas, betas) = params
    hiddens.append(X)
    for i in range(len(layer_param)-1):
        hidden = np.dot(hiddens[i], weights[i])                               # 1 linear
        bn_ins.append(hidden)
        (hidden, bn_cache, bn_hat) = BN_forward(hidden, gammas[i], betas[i])  # 2 BN
        bn_hats.append(bn_hat)
        bn_caches.append(bn_cache)
        if i < len(layer_param)-2:
            hiddens.append(np.maximum(0, hidden))                             # 3 ReLU
        else:
            scores = hidden                                                   # 4 output
    return (hiddens, scores, bn_ins, bn_hats, bn_caches)

def predict_BN(X, labels, layer_param, params):
    (weights, gammas, betas) = params
    hidden = X
    for i in range(len(layer_param)-1):
        hidden = np.dot(hidden, weights[i])
        (hidden, bn_cache, bn_hat) = BN_forward(hidden, gammas[i], betas[i])
        if i < len(layer_param)-2:
            hidden = np.maximum(0, hidden)
        else:
            scores = hidden
    predicted_class = np.argmax(scores, axis=1)
    right_class = predicted_class == labels
    return np.mean(right_class)

def gradient_backprop_BN(dscores, params, hiddens, bn_ins, bn_hats, bn_caches, reg):
    (weights, gammas, betas) = params
    dweights = []
    dgammas = []
    dbetas = []
    dhidden = dscores
    for i in range(len(hiddens)-1, -1, -1):
        (dhidden, dgamma, dbeta) = BN_backprop(dhidden, bn_ins[i], bn_hats[i],
                                               bn_caches[i], gammas[i], betas[i])
        dgammas.append(dgamma)
        dbetas.append(dbeta)
        dweights.append(np.dot(hiddens[i].T, dhidden) + reg*weights[i])
        dhidden = np.dot(dhidden, weights[i].T)
        dhidden[hiddens[i] <= 0] = 0
    return (dweights, dgammas, dbetas)

def train_net_BN(data, layer_param, super_params, max_epoch):
    (X_train, labels_train, X_val, labels_val) = data
    (lr_w, lr_decay, mu_w, lr_bn, mu_bn, reg) = super_params
    (params, vparams) = initialize_parameters_BN(layer_param)
    epoch = 0
    data_losses = []
    reg_losses = []
    val_accuracy = []
    train_accuracy = []
    weights_update_ratio = []
    gammas_update_ratio = []
    betas_update_ratio = []
    while epoch < max_epoch:
        (hiddens, scores, bn_ins, bn_hats, bn_caches) = forward_BN(X_train, layer_param, params)
        val_accuracy.append(predict_BN(X_val, labels_val, layer_param, params))
        train_accuracy.append(predict_BN(X_train, labels_train, layer_param, params))
        data_loss = data_loss_softmax(scores, labels_train)
        reg_loss = reg_L2_loss(params[0], reg)
        dscores = dscores_softmax(scores, labels_train)
        (dweights, dgammas, dbetas) = gradient_backprop_BN(
            dscores, params, hiddens, bn_ins, bn_hats, bn_caches, reg)
        weights_update_ratio.append(nesterov_momentumSGD(vparams[0], params[0], dweights, lr_w, mu_w))
        gammas_update_ratio.append(nesterov_momentumSGD(vparams[1], params[1], dgammas, lr_bn, mu_bn))
        betas_update_ratio.append(nesterov_momentumSGD(vparams[2], params[2], dbetas, lr_bn, mu_bn))
        data_losses.append(data_loss)
        reg_losses.append(reg_loss)
        epoch += 1
        lr_w *= lr_decay

    # visualize the data loss and the training/validation accuracy
    plt.close()
    fig = plt.figure('loss')
    ax = fig.add_subplot(2, 1, 1)
    ax.grid(True)
    ax2 = fig.add_subplot(2, 1, 2)
    ax2.grid(True)
    plt.xlabel('log10(lr_w)=' + str(round(np.log10(lr_w), 2)) + '  ' +
               'log10(reg)=' + str(round(np.log10(reg), 2)), fontsize=14)
    plt.ylabel('accuracy    log10(data loss)', fontsize=14)
    ax.scatter(np.arange(len(data_losses)), np.log10(data_losses), c='b', marker='.')
    ax2.scatter(np.arange(len(val_accuracy)), val_accuracy, c='r', marker='*')
    ax2.scatter(np.arange(len(train_accuracy)), train_accuracy, c='g', marker='.')
    plt.show()

    # plot per-layer update ratios for the weights and betas on a log scale
    for layer in range(len(params[0])):
        wur = [weights_update_ratio[i][layer] for i in range(len(weights_update_ratio))]
        bur = [betas_update_ratio[i][layer] for i in range(len(betas_update_ratio))]
        plt.close()
        fig = plt.figure('update ratio')
        ax = fig.add_subplot(2, 1, 1)
        ax.grid(True)
        ax2 = fig.add_subplot(2, 1, 2)
        ax2.grid(True)
        plt.xlabel('log10(lr_w)=' + str(round(np.log10(lr_w), 2)) + '  ' +
                   'log10(reg)=' + str(round(np.log10(reg), 2)), fontsize=14)
        ax.scatter(np.arange(len(wur)), np.log10(wur), c='b', marker='.')
        ax2.scatter(np.arange(len(bur)), np.log10(bur), c='r', marker='*')
        plt.show()
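A sketch of how the BN network might be trained on the preprocessed data from section 8. The hyperparameter values below are illustrative assumptions, not tuned values from the original text:

# hypothetical driver; the learning rates and reg are illustrative values
data = (X_train_pca, labels_train, X_val_pca, labels_val)
layer_param = [dim, 100, 100, N_class]
super_params = (10**(-2), 1, 0.9,   # lr_w, lr_decay, mu_w
                10**(-2), 0.9,      # lr_bn, mu_bn
                10**(-4))           # reg
train_net_BN(data, layer_param, super_params, max_epoch=1000)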
