import math

import numpy as np
from scipy.optimize import root

# Seconds per day, used to convert between per-day and per-second quantities.
day_ratio = 24 * 3600
# Assumed aspect ratio: depth is taken to be width / depth_width_ratio.
depth_width_ratio = 128

# Fitted speed-model constants (k, k1, k2, b, c, layer_base) per GPU type.
constants_per_gpu = {
    "V100": [2.21527743e+07, 1.18538628e+00, 1.43150104e+00, 1.66015023e+00,
             1.32808220e+00, 5.91503856e+00],
    "V100 (without tensor cores and cudnn.benchmark)": [1.82997989e+07, 1.05349588e+00, 1.25312127e+00, 1.67071294e+00,
                                                        1.44610885e+00, 5.55824273e+00],
    "P100": [6.01863899e+07, 9.23656025e-01, 1.03230702e+00, 1.46733667e+00,
             1.03031298e+00, 5.38021875e+00],
    "P4": [4.84472202e+07, 9.86822195e-01, 1.23474901e+00, 1.38493518e+00,
           1.04630858e+00, 1.03572754e+01],
    "K80": [2.58592374e+07, 6.42050890e-01, 7.06115162e-01, 1.44360777e+00,
            7.50695980e-01, 6.25951436e+00]
}

# Hourly price per GPU, in dollars.
price_per_gpu = {
    "K80": 0.584,
    "P4": 0.689,
    "V100": 2.005,
    "V100 (without tensor cores and cudnn.benchmark)": 2.005,
    "P100": 1.416,
}

optimal_batch_size_per_gpu = {
    "P4": 16,
    "V100": 64,
    "V100 (without tensor cores and cudnn.benchmark)": 64,
    "P100": 64,
    "K80": 16
}

# One-hot features for the mixed-precision (amp) mode: (O0, O1, O2).
features_per_amp_mode = {
    "O0": (1, 0, 0),
    "O1": (0, 1, 0),
    "O2": (0, 0, 1)
}

# Average power draw per GPU, in kW (so hours * consumption gives kWh).
gpu_consumption = {
    "V100": 119.3495934959e-3,
    "V100 (without tensor cores and cudnn.benchmark)": 119.3495934959e-3,
    "K80": 142.42e-3,
    "P4": 55.27e-3,
    "P100": 139.65e-3
}

# Carbon intensity of electricity, in kg CO2eq per kWh (534 g/kWh).
co2_intensity = 534 * 1e-3
def flo_speed(features, constants):
    # Fitted training speed (FLOs per second) for a given hardware/model configuration.
    k, k1, k2, b, c, layer_base = constants
    o0, o1, o2, x, y, z = features  # amp-mode one-hots, depth, width, batch size
    return k * np.power(k1, o1) * np.power(k2, o2) * x / (x + layer_base) * np.power(y, b) * np.power(np.log(z + 1), c)
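

# Hedged example of the feature layout flo_speed expects (the values here are
# arbitrary, chosen only for illustration): the tuple is
# (o0, o1, o2, depth, width, batch_size), as assembled by the callers below.
#     flo_speed((0, 1, 0, 512 / depth_width_ratio, 512, optimal_batch_size_per_gpu["V100"]),
#               constants_per_gpu["V100"])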

def param_polynomial(width, depth=None, inner=None):
    # Closed-form parameter count as a function of width (and optionally depth
    # and inner dimension). When depth is omitted, it is inferred as
    # width / depth_width_ratio, which is substituted into the explicit-depth formula.
    if depth is not None:
        if inner is not None:
            return 5 * depth * (width ** 2) + 2 * depth * (width * inner) + 7 * depth * width + depth * inner + 3 * width + 3
        else:
            return 7 * depth * (width ** 2) + 8 * depth * width + 3 * width + 3
    else:
        if inner is not None:
            return 5 / depth_width_ratio * (width ** 3) + 2 / depth_width_ratio * (width ** 2 * inner) \
                   + 7 / depth_width_ratio * width ** 2 + width * inner / depth_width_ratio + 3 * width + 3
        else:
            return 7 / depth_width_ratio * (width ** 3) + 8 / depth_width_ratio * (width ** 2) + 3 * width + 3
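

# Hedged consistency check (the width is arbitrary): with depth_width_ratio = 128,
# the depth-free branch matches the explicit-depth branch when
# depth == width / depth_width_ratio, e.g.
#     param_polynomial(128, depth=1) == param_polynomial(128)  # both 116099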

def optimal_model_shape(width, param_number, base=8):
    # Depth follows the width / depth_width_ratio rule; width is then re-solved
    # from the parameter budget at that depth and rounded to a multiple of `base`.
    depth = max(1, math.floor(width / depth_width_ratio))
    poly_params = np.array([depth * 7, depth * 8 + 3, 3 - param_number])
    roots = np.roots(poly_params)
    corresponding_width = int(base * round(max(roots) / base))
    return depth, corresponding_width


def alternate_model_shape(width, param_number, base=8):
    # Deeper-than-linear alternative: depth grows like width ** 1.25 instead of width.
    linear_depth = max(1, math.floor(width / depth_width_ratio))
    depth = max(linear_depth + 1, math.floor(0.3 * width ** 1.25 / depth_width_ratio))
    poly_params = np.array([depth * 7, depth * 8 + 3, 3 - param_number])
    roots = np.roots(poly_params)
    corresponding_width = int(base * round(max(roots) / base))
    return depth, corresponding_width
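

# Hedged example (the 1024 starting width and the 1e8-parameter budget are
# arbitrary illustrative values): both helpers return a (depth, width) pair
# whose parameter count is close to the requested budget.
#     optimal_model_shape(1024, 1e8)    # -> (8, width rounded to a multiple of 8)
#     alternate_model_shape(1024, 1e8)  # -> deeper, narrower variant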

def hours_to_width(hours, gpu, amp_mode, param_popt):
    # Inverts width_to_hours: finds the width whose estimated training time matches `hours`.
    seconds = hours * 3600
    d, e, f = param_popt
    constants = constants_per_gpu[gpu]
    amp_features = features_per_amp_mode[amp_mode]

    def equation_function(width):
        return np.power((param_polynomial(width) - f) / d, 1 / e) / flo_speed(
            (*amp_features, width / depth_width_ratio, width, optimal_batch_size_per_gpu[gpu]),
            constants) * day_ratio - seconds

    width = iterative_solutions(equation_function, initial_guess=128)
    # print("width: {}".format(math.floor(width)))
    # print("depth: {}".format(width / depth_width_ratio))
    # print("param number: {:.4e}".format(param_polynomial(width)))
    speed = flo_speed((*amp_features, width / depth_width_ratio, width, optimal_batch_size_per_gpu[gpu]), constants)
    # print("speed: {:.4e}".format(speed))
    # print("flos from speed: {:.4e}".format(seconds * speed))
    # print("flos from params: {:.4e}".format(np.power((param_polynomial(width) - f) / d, 1 / e) * day_ratio))
    # print("params from flos: {:.4e}".format(np.exp(param_fit(speed * seconds / day_ratio, *param_popt))))
    return width

def iterative_solutions(equation_function, initial_guess):
    # Retries the root search with progressively smaller initial guesses until
    # the residual is small enough; returns the last candidate otherwise.
    width = initial_guess
    while initial_guess > 16:
        solution_array = root(equation_function, np.array([initial_guess]), method="hybr").x
        width = solution_array[0]
        should_be_zero = equation_function(width)
        if np.abs(should_be_zero) < 1e0:
            return width
        else:
            initial_guess *= 0.5
    return width

def width_to_flo(width, d, e, f):
    # FLOs needed to train a model of this width, from the inverted parameter fit.
    return np.power((param_polynomial(width) - f) / d, 1 / e) * day_ratio


def loss_fit(x, a, b, c):
    # Power-law fit of loss as a function of compute: a * x^(-b) + c.
    return a * np.power(x, -b) + c


def param_fit(x, d, e, f):
    # Fit of log(parameter count) as a function of compute: log(d * x^e + f).
    return np.log(d * np.power(x, e) + f)


def hours_to_dollars(hours, gpu):
    return hours * price_per_gpu[gpu]


def dollars_to_hours(dollars, gpu):
    return dollars / price_per_gpu[gpu]


def hours_to_kWh(hours, gpu):
    return hours * gpu_consumption[gpu]


def hours_to_co2(hours, gpu):
    return hours * gpu_consumption[gpu] * co2_intensity
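

# Hedged worked example (the 24-hour figure is arbitrary): with the constants
# above, a day of single-V100 time costs 24 * 2.005 ≈ $48.1, draws
# 24 * 0.119 ≈ 2.86 kWh and emits 2.86 * 0.534 ≈ 1.53 kg CO2eq.
#     hours_to_dollars(24, "V100")  # ≈ 48.12
#     hours_to_kWh(24, "V100")      # ≈ 2.86
#     hours_to_co2(24, "V100")      # ≈ 1.53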

def loss_to_flo(loss, a, b, c):
    # Inverse of loss_fit.
    return ((loss - c) / a) ** (-1 / b)


def param_to_flo(param_number, d, e, f):
    # Inverse of the (un-logged) parameter fit.
    return ((param_number - f) / d) ** (1 / e)


def safe_flo_to_param(flo, d, e, f):
    return d * np.power(flo, e) + f

def param_to_width(param_number):
    # Solves param_polynomial(width) == param_number for width (depth-free branch).
    poly_params = np.array([7 / depth_width_ratio, 8 / depth_width_ratio, 3, 3 - param_number])
    roots = np.roots(poly_params)
    real_roots = [np.real(candidate) for candidate in roots if np.abs(np.imag(candidate)) < 1e-5]
    width = max(real_roots)
    return width


def safe_param_to_width(param_number):
    # np.roots can fail to converge; retry on a slightly larger parameter count.
    try:
        return param_to_width(param_number)
    except np.linalg.LinAlgError:
        return safe_param_to_width(1.5 * param_number)
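

# Hedged round-trip check (the width is arbitrary): param_to_width inverts the
# depth-free branch of param_polynomial, e.g.
#     param_to_width(param_polynomial(128))  # ≈ 128.0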

def width_to_hours(width, gpu, amp_mode, param_popt):
    # Estimated training time: FLOs implied by the parameter fit, divided by the fitted speed.
    d, e, f = param_popt
    constants = constants_per_gpu[gpu]
    amp_features = features_per_amp_mode[amp_mode]
    flos_from_params = np.power((param_polynomial(width) - f) / d, 1 / e) * day_ratio
    speed = flo_speed((*amp_features, width / depth_width_ratio, width, optimal_batch_size_per_gpu[gpu]), constants)
    seconds = flos_from_params / speed
    return seconds / 3600


def param_prime(width, depth=None):
    # Derivative of param_polynomial with respect to width.
    if depth is not None:
        return 14 * depth * width + 8 * depth + 3
    else:
        return 21 / depth_width_ratio * (width ** 2) + 16 / depth_width_ratio * width + 3

def flo_speed_prime(width, gpu, amp_mode):
    # Derivative of flo_speed with respect to width, with depth = width / depth_width_ratio.
    k, k1, k2, b, c, layer_base = constants_per_gpu[gpu]
    o0, o1, o2 = features_per_amp_mode[amp_mode]
    mult_constant = k * np.power(k1, o1) * np.power(k2, o2) * np.power(np.log(optimal_batch_size_per_gpu[gpu] + 1), c)
    return mult_constant * ((b + 1) * np.power(width, b) / (width + layer_base * depth_width_ratio)
                            - np.power(width, b + 1) / (width + layer_base * depth_width_ratio) ** 2)

# Awkward equation: we're looking for the width below which narrowing the model
# further actually makes it less efficient to train.
def tipping_point(gpu, amp_mode, param_popt):
    d, e, f = param_popt
    o0, o1, o2 = features_per_amp_mode[amp_mode]

    def equation_function(width):
        return np.power((param_polynomial(width) - f) / d, -1) / e * param_prime(width) / d \
               * flo_speed((o0, o1, o2, width / depth_width_ratio, width, optimal_batch_size_per_gpu[gpu]),
                           constants_per_gpu[gpu]) - \
               flo_speed_prime(width, gpu, amp_mode)

    tipping_width = iterative_solutions(equation_function, initial_guess=100)
    return tipping_width

def update_tip(tip, width, gpu, amp_mode, loss_popt, param_popt):
    a, b, c = loss_popt
    d, e, f = param_popt
    tip["width"] = width
    tip["param_number"] = param_polynomial(width)
    tip["flo"] = np.power((tip["param_number"] - f) / d, 1 / e)
    tip["loss"] = loss_fit(tip["flo"], a, b, c)
    tip["hours"] = width_to_hours(width, gpu, amp_mode, param_popt)