import math

import numpy as np
from scipy.optimize import root

# Seconds per day, used to convert between per-day and per-second quantities.
day_ratio = 24 * 3600
# Assumed aspect ratio: depth is taken to be width / depth_width_ratio.
depth_width_ratio = 128

# Fitted speed-model constants (k, k1, k2, b, c, layer_base) per GPU type.
constants_per_gpu = {
    "V100": [2.21527743e+07, 1.18538628e+00, 1.43150104e+00, 1.66015023e+00,
             1.32808220e+00, 5.91503856e+00],
    "V100 (without tensor cores and cudnn.benchmark)": [1.82997989e+07, 1.05349588e+00, 1.25312127e+00, 1.67071294e+00,
                                                        1.44610885e+00, 5.55824273e+00],
    "P100": [6.01863899e+07, 9.23656025e-01, 1.03230702e+00, 1.46733667e+00,
             1.03031298e+00, 5.38021875e+00],
    "P4": [4.84472202e+07, 9.86822195e-01, 1.23474901e+00, 1.38493518e+00,
           1.04630858e+00, 1.03572754e+01],
    "K80": [2.58592374e+07, 6.42050890e-01, 7.06115162e-01, 1.44360777e+00,
            7.50695980e-01, 6.25951436e+00]
}

# Hourly price per GPU, in dollars.
price_per_gpu = {
    "K80": 0.584,
    "P4": 0.689,
    "V100": 2.005,
    "V100 (without tensor cores and cudnn.benchmark)": 2.005,
    "P100": 1.416,
}

optimal_batch_size_per_gpu = {
    "P4": 16,
    "V100": 64,
    "V100 (without tensor cores and cudnn.benchmark)": 64,
    "P100": 64,
    "K80": 16
}

# One-hot features for the mixed-precision (amp) mode: (O0, O1, O2).
features_per_amp_mode = {
    "O0": (1, 0, 0),
    "O1": (0, 1, 0),
    "O2": (0, 0, 1)
}

# Average power draw per GPU, in kW (so hours * consumption gives kWh).
gpu_consumption = {
    "V100": 119.3495934959e-3,
    "V100 (without tensor cores and cudnn.benchmark)": 119.3495934959e-3,
    "K80": 142.42e-3,
    "P4": 55.27e-3,
    "P100": 139.65e-3
}

# Carbon intensity of electricity, in kg CO2eq per kWh (534 g/kWh).
co2_intensity = 534 * 1e-3
def flo_speed(features, constants):
    # Fitted training speed (FLOs per second) for a given hardware/model configuration.
    k, k1, k2, b, c, layer_base = constants
    o0, o1, o2, x, y, z = features  # amp-mode one-hots, depth, width, batch size
    return k * np.power(k1, o1) * np.power(k2, o2) * x / (x + layer_base) * np.power(y, b) * np.power(np.log(z + 1), c)
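

# Hedged example of the feature layout flo_speed expects (the values here are
# arbitrary, chosen only for illustration): the tuple is
# (o0, o1, o2, depth, width, batch_size), as assembled by the callers below.
#     flo_speed((0, 1, 0, 512 / depth_width_ratio, 512, optimal_batch_size_per_gpu["V100"]),
#               constants_per_gpu["V100"])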

def param_polynomial(width, depth=None, inner=None):
    # Closed-form parameter count as a function of width (and optionally depth
    # and inner dimension). When depth is omitted, it is inferred as
    # width / depth_width_ratio, which is substituted into the explicit-depth formula.
    if depth is not None:
        if inner is not None:
            return 5 * depth * (width ** 2) + 2 * depth * (width * inner) + 7 * depth * width + depth * inner + 3 * width + 3
        else:
            return 7 * depth * (width ** 2) + 8 * depth * width + 3 * width + 3
    else:
        if inner is not None:
            return 5 / depth_width_ratio * (width ** 3) + 2 / depth_width_ratio * (width ** 2 * inner) \
                   + 7 / depth_width_ratio * width ** 2 + width * inner / depth_width_ratio + 3 * width + 3
        else:
            return 7 / depth_width_ratio * (width ** 3) + 8 / depth_width_ratio * (width ** 2) + 3 * width + 3
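

# Hedged consistency check (the width is arbitrary): with depth_width_ratio = 128,
# the depth-free branch matches the explicit-depth branch when
# depth == width / depth_width_ratio, e.g.
#     param_polynomial(128, depth=1) == param_polynomial(128)  # both 116099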

def optimal_model_shape(width, param_number, base=8):
    # Depth follows the width / depth_width_ratio rule; width is then re-solved
    # from the parameter budget at that depth and rounded to a multiple of `base`.
    depth = max(1, math.floor(width / depth_width_ratio))
    poly_params = np.array([depth * 7, depth * 8 + 3, 3 - param_number])
    roots = np.roots(poly_params)
    corresponding_width = int(base * round(max(roots) / base))
    return depth, corresponding_width


def alternate_model_shape(width, param_number, base=8):
    # Deeper-than-linear alternative: depth grows like width ** 1.25 instead of width.
    linear_depth = max(1, math.floor(width / depth_width_ratio))
    depth = max(linear_depth + 1, math.floor(0.3 * width ** 1.25 / depth_width_ratio))
    poly_params = np.array([depth * 7, depth * 8 + 3, 3 - param_number])
    roots = np.roots(poly_params)
    corresponding_width = int(base * round(max(roots) / base))
    return depth, corresponding_width
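

# Hedged example (the 1024 starting width and the 1e8-parameter budget are
# arbitrary illustrative values): both helpers return a (depth, width) pair
# whose parameter count is close to the requested budget.
#     optimal_model_shape(1024, 1e8)    # -> (8, width rounded to a multiple of 8)
#     alternate_model_shape(1024, 1e8)  # -> deeper, narrower variant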

def hours_to_width(hours, gpu, amp_mode, param_popt):
    # Inverts width_to_hours: finds the width whose estimated training time matches `hours`.
    seconds = hours * 3600
    d, e, f = param_popt
    constants = constants_per_gpu[gpu]
    amp_features = features_per_amp_mode[amp_mode]

    def equation_function(width):
        return np.power((param_polynomial(width) - f) / d, 1 / e) / flo_speed(
            (*amp_features, width / depth_width_ratio, width, optimal_batch_size_per_gpu[gpu]),
            constants) * day_ratio - seconds

    width = iterative_solutions(equation_function, initial_guess=128)
    # print("width: {}".format(math.floor(width)))
    # print("depth: {}".format(width / depth_width_ratio))
    # print("param number: {:.4e}".format(param_polynomial(width)))
    speed = flo_speed((*amp_features, width / depth_width_ratio, width, optimal_batch_size_per_gpu[gpu]), constants)
    # print("speed: {:.4e}".format(speed))
    # print("flos from speed: {:.4e}".format(seconds * speed))
    # print("flos from params: {:.4e}".format(np.power((param_polynomial(width) - f) / d, 1 / e) * day_ratio))
    # print("params from flos: {:.4e}".format(np.exp(param_fit(speed * seconds / day_ratio, *param_popt))))
    return width

def iterative_solutions(equation_function, initial_guess):
    # Retries the root search with progressively smaller initial guesses until
    # the residual is small enough; returns the last candidate otherwise.
    width = initial_guess
    while initial_guess > 16:
        solution_array = root(equation_function, np.array([initial_guess]), method="hybr").x
        width = solution_array[0]
        should_be_zero = equation_function(width)
        if np.abs(should_be_zero) < 1e0:
            return width
        else:
            initial_guess *= 0.5
    return width

def width_to_flo(width, d, e, f):
    # FLOs needed to train a model of this width, from the inverted parameter fit.
    return np.power((param_polynomial(width) - f) / d, 1 / e) * day_ratio


def loss_fit(x, a, b, c):
    # Power-law fit of loss as a function of compute: a * x^(-b) + c.
    return a * np.power(x, -b) + c


def param_fit(x, d, e, f):
    # Fit of log(parameter count) as a function of compute: log(d * x^e + f).
    return np.log(d * np.power(x, e) + f)


def hours_to_dollars(hours, gpu):
    return hours * price_per_gpu[gpu]


def dollars_to_hours(dollars, gpu):
    return dollars / price_per_gpu[gpu]


def hours_to_kWh(hours, gpu):
    return hours * gpu_consumption[gpu]


def hours_to_co2(hours, gpu):
    return hours * gpu_consumption[gpu] * co2_intensity
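

# Hedged worked example (the 24-hour figure is arbitrary): with the constants
# above, a day of single-V100 time costs 24 * 2.005 ≈ $48.1, draws
# 24 * 0.119 ≈ 2.86 kWh and emits 2.86 * 0.534 ≈ 1.53 kg CO2eq.
#     hours_to_dollars(24, "V100")  # ≈ 48.12
#     hours_to_kWh(24, "V100")      # ≈ 2.86
#     hours_to_co2(24, "V100")      # ≈ 1.53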

def loss_to_flo(loss, a, b, c):
    # Inverse of loss_fit.
    return ((loss - c) / a) ** (-1 / b)


def param_to_flo(param_number, d, e, f):
    # Inverse of the (un-logged) parameter fit.
    return ((param_number - f) / d) ** (1 / e)


def safe_flo_to_param(flo, d, e, f):
    return d * np.power(flo, e) + f

def param_to_width(param_number):
    # Solves param_polynomial(width) == param_number for width (depth-free branch).
    poly_params = np.array([7 / depth_width_ratio, 8 / depth_width_ratio, 3, 3 - param_number])
    roots = np.roots(poly_params)
    real_roots = [np.real(candidate) for candidate in roots if np.abs(np.imag(candidate)) < 1e-5]
    width = max(real_roots)
    return width


def safe_param_to_width(param_number):
    # np.roots can fail to converge; retry on a slightly larger parameter count.
    try:
        return param_to_width(param_number)
    except np.linalg.LinAlgError:
        return safe_param_to_width(1.5 * param_number)
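

# Hedged round-trip check (the width is arbitrary): param_to_width inverts the
# depth-free branch of param_polynomial, e.g.
#     param_to_width(param_polynomial(128))  # ≈ 128.0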

def width_to_hours(width, gpu, amp_mode, param_popt):
    # Estimated training time: FLOs implied by the parameter fit, divided by the fitted speed.
    d, e, f = param_popt
    constants = constants_per_gpu[gpu]
    amp_features = features_per_amp_mode[amp_mode]
    flos_from_params = np.power((param_polynomial(width) - f) / d, 1 / e) * day_ratio
    speed = flo_speed((*amp_features, width / depth_width_ratio, width, optimal_batch_size_per_gpu[gpu]), constants)
    seconds = flos_from_params / speed
    return seconds / 3600


def param_prime(width, depth=None):
    # Derivative of param_polynomial with respect to width.
    if depth is not None:
        return 14 * depth * width + 8 * depth + 3
    else:
        return 21 / depth_width_ratio * (width ** 2) + 16 / depth_width_ratio * width + 3

def flo_speed_prime(width, gpu, amp_mode):
    # Derivative of flo_speed with respect to width, with depth = width / depth_width_ratio.
    k, k1, k2, b, c, layer_base = constants_per_gpu[gpu]
    o0, o1, o2 = features_per_amp_mode[amp_mode]
    mult_constant = k * np.power(k1, o1) * np.power(k2, o2) * np.power(np.log(optimal_batch_size_per_gpu[gpu] + 1), c)
    return mult_constant * ((b + 1) * np.power(width, b) / (width + layer_base * depth_width_ratio)
                            - np.power(width, b + 1) / (width + layer_base * depth_width_ratio) ** 2)

# Awkward equation: we're looking for the width below which narrowing the model
# further actually makes it less efficient to train.
def tipping_point(gpu, amp_mode, param_popt):
    d, e, f = param_popt
    o0, o1, o2 = features_per_amp_mode[amp_mode]

    def equation_function(width):
        return np.power((param_polynomial(width) - f) / d, -1) / e * param_prime(width) / d \
               * flo_speed((o0, o1, o2, width / depth_width_ratio, width, optimal_batch_size_per_gpu[gpu]),
                           constants_per_gpu[gpu]) - \
               flo_speed_prime(width, gpu, amp_mode)

    tipping_width = iterative_solutions(equation_function, initial_guess=100)
    return tipping_width

def update_tip(tip, width, gpu, amp_mode, loss_popt, param_popt):
    a, b, c = loss_popt
    d, e, f = param_popt
    tip["width"] = width
    tip["param_number"] = param_polynomial(width)
    tip["flo"] = np.power((tip["param_number"] - f) / d, 1 / e)
    tip["loss"] = loss_fit(tip["flo"], a, b, c)
    tip["hours"] = width_to_hours(width, gpu, amp_mode, param_popt)