Source code for hessianfree.gpu.profiling

from __future__ import print_function

import ast
import pstats
import sys
import time
from cProfile import Profile

import numpy as np
import pycuda
import pycuda.autoinit
from pycuda import gpuarray

import hessianfree as hf


[docs]def threshold_calc_G(): """Compare GPU vs CPU performance on feedforward curvature calculation. This can use this to determine whether it is better to run some target network on the CPU or GPU.""" batch_size = range(256, 1025, 256) layer_size = [1] + range(64, 513, 64) reps = 100 times = np.zeros((len(batch_size), len(layer_size), 2)) for i, b in enumerate(batch_size): inputs = np.random.randn(b, 1).astype(np.float32) targets = np.random.randn(b, 1).astype(np.float32) for j, n in enumerate(layer_size): ff = hf.FFNet([1, n, n, 1], use_GPU=False) ff.cache_minibatch(inputs, targets) v = np.random.randn(ff.W.size).astype(np.float32) for _ in range(5): ff.calc_G(v) start = time.time() for _ in range(reps): ff.calc_G(v) times[i, j, 0] = time.time() - start ff = hf.FFNet([1, n, n, 1], use_GPU=True) ff.cache_minibatch(inputs, targets) v = gpuarray.to_gpu(v) for _ in range(5): ff.GPU_calc_G(v) start = time.time() for _ in range(reps): ff.GPU_calc_G(v) v = v.get() times[i, j, 1] = time.time() - start print("b", b, "n", n, "times", times[i, j]) print(times[..., 1] - times[..., 0]) print("batch size (%s) vs layer size (%s)" % (batch_size, layer_size)) print(" (True indicates GPU is faster)") print(times[..., 1] < times[..., 0])
[docs]def threshold_rnn_calc_G(): """Compare GPU vs CPU performance on recurrent curvature calculation. This can use this to determine whether it is better to run some target network on the CPU or GPU.""" batch_size = 1024 layer_size = [1] + range(32, 129, 32) sig_len = [1] + range(8, 33, 8) reps = 100 times = np.zeros((len(sig_len), len(layer_size), 2)) for i, b in enumerate(sig_len): inputs = np.random.randn(batch_size, b, 1).astype(np.float32) targets = np.random.randn(batch_size, b, 1).astype(np.float32) for j, n in enumerate(layer_size): rnn = hf.RNNet([1, n, 1], use_GPU=False) rnn.cache_minibatch(inputs, targets) v = np.random.randn(rnn.W.size).astype(np.float32) for _ in range(5): rnn.calc_G(v) start = time.time() for _ in range(reps): rnn.calc_G(v) times[i, j, 0] = time.time() - start rnn = hf.RNNet([1, n, 1], use_GPU=True) rnn.cache_minibatch(inputs, targets) v = gpuarray.to_gpu(v) for _ in range(5): rnn.GPU_calc_G(v) start = time.time() for _ in range(reps): rnn.GPU_calc_G(v) v = v.get() times[i, j, 1] = time.time() - start print("b", b, "n", n, "times", times[i, j]) print(times[..., 1] - times[..., 0]) print("signal length (%s) versus layer size (%s)" % (sig_len, layer_size)) print(" (True indicates GPU is faster)") print(times[..., 1] < times[..., 0])
[docs]def profile_calc_G(cprofile=True): """Run a profiler on the feedforward curvature calculation. :param bool cprofile: use True if profiling on the CPU, False if using the CUDA profiler """ inputs = np.random.randn(1024, 1).astype(np.float32) targets = np.random.randn(1024, 1).astype(np.float32) N = 1024 ff = hf.FFNet([1, N, N, 1], use_GPU=True) ff.cache_minibatch(inputs, targets) v = np.random.randn(ff.W.size).astype(np.float32) for _ in range(5): # run it a few times to get rid of any startup overhead ff.GPU_calc_G(v) if cprofile: start = time.time() p = Profile() p.enable() else: pycuda.driver.start_profiler() for _ in range(500): _ = ff.GPU_calc_G(v) if cprofile: p.disable() print("time", time.time() - start) ps = pstats.Stats(p) ps.strip_dirs().sort_stats('time').print_stats(20) else: pycuda.driver.stop_profiler()
[docs]def profile_rnn_calc_G(cprofile=True): """Run a profiler on the recurrent curvature calculation. :param bool cprofile: use True if profiling on the CPU, False if using the CUDA profiler """ inputs = np.random.randn(1024, 128, 1).astype(np.float32) targets = np.random.randn(1024, 128, 1).astype(np.float32) N = 128 rnn = hf.RNNet([1, N, 1], use_GPU=True) rnn.optimizer = hf.opt.HessianFree() # for struc_damping check rnn.cache_minibatch(inputs, targets) v = np.random.randn(rnn.W.size).astype(np.float32) for _ in range(2): # run it a few times to get rid of any startup overhead rnn.GPU_calc_G(v) if cprofile: start = time.time() p = Profile() p.enable() else: pycuda.driver.start_profiler() for _ in range(100): _ = rnn.GPU_calc_G(v) if cprofile: p.disable() print("time", time.time() - start) ps = pstats.Stats(p) ps.strip_dirs().sort_stats('time').print_stats(20) else: pycuda.driver.stop_profiler()
[docs]def profile_dot(cprofile=True): """Run a profiler on the matrix multiplication kernel. :param bool cprofile: use True if profiling on the CPU, False if using the CUDA profiler """ N = 1024 a = np.random.randn(N, N).astype(np.float32) b = np.random.randn(N, N).astype(np.float32) a_gpu = gpuarray.to_gpu(a) b_gpu = gpuarray.to_gpu(b) c_gpu = gpuarray.zeros((N, N), np.float32) for _ in range(2): # run it a few times to get rid of any startup overhead hf.gpu.dot(a_gpu, b_gpu, out=c_gpu) if cprofile: start = time.time() p = Profile() p.enable() else: pycuda.autoinit.context.synchronize() pycuda.driver.start_profiler() for _ in range(100): hf.gpu.dot(a_gpu, b_gpu, out=c_gpu, transpose_a=True, transpose_b=True) c_gpu.get() if cprofile: p.disable() print("time", time.time() - start) ps = pstats.Stats(p) ps.strip_dirs().sort_stats('time').print_stats(20) else: pycuda.driver.stop_profiler()
if __name__ == "__main__": if sys.argv[1] in locals(): locals()[sys.argv[1]](*[ast.literal_eval(a) for a in sys.argv[2:]]) else: print("Unknown profile function (%s)" % sys.argv[1])