"""Helpers for 2:4 structured pruning and column-permutation search.

A CUDA extension (``permutation_search_cuda``) is used when it can be imported
and ``nvidia-smi`` reports at least one GPU; otherwise a NumPy CPU path is used.
"""

import numpy as np
import time
import ctypes
import subprocess
import os
import math

# Lazily-populated GPU detection state, shared by every call to use_gpu().
gpus_tested = False
gpus_found = 0
kernels_found = True
try:
    import permutation_search_cuda as permutation_search_cuda_kernels
    print("Found permutation search CUDA kernels")
except ImportError:
    print("Could not find permutation search CUDA kernels, falling back to CPU path")
    kernels_found = False


def use_gpu(initial_override=True):
    """Return True when both a GPU and the CUDA kernels are available.

    The probe (``nvidia-smi -L``) runs at most once; its result is cached in
    the module-level ``gpus_found`` / ``gpus_tested`` globals.

    Args:
        initial_override: Only consulted on the first call. If falsy, the
            probe is skipped and marked as done, so ``gpus_found`` stays 0
            and this and every later call return False.

    Returns:
        ``gpus_found > 0 and kernels_found``.
    """
    global gpus_tested, gpus_found

    if not gpus_tested:
        if not initial_override:
            gpus_tested = True
            return False

        try:
            listing = subprocess.check_output(["nvidia-smi", "-L"])
        except Exception:
            # Best effort: a missing binary or a non-zero exit both mean "no GPU".
            # Deliberately not a bare except, so Ctrl-C still propagates.
            gpus_found = 0
            print("Could not find nvidia-smi, please check your cuda installation")
        else:
            # Each occurrence of "UUID" in the listing is counted as one GPU.
            gpus_found = listing.count(b"UUID")
            print(f"Found {gpus_found} gpus")

        gpus_tested = True

    return gpus_found > 0 and kernels_found


##############################################################################################
# pruning utilities
##############################################################################################

## apply 2:4 to some matrix
def apply_2_to_4(matrix):
    """Zero the two smallest-magnitude entries of every group of 4 columns.

    The matrix is modified IN PLACE and also returned. Groups are taken per
    row as columns ``[0:4], [4:8], ...``. A trailing group with a single
    column raises ``IndexError``, so the column count should be a multiple
    of 4.
    """
    n_rows, n_cols = matrix.shape[0], matrix.shape[1]
    for row in range(n_rows):
        for col in range(0, n_cols, 4):
            order = np.argsort(np.abs(matrix[row, col:col + 4]))
            matrix[row, col + order[0]] = 0.0
            matrix[row, col + order[1]] = 0.0
    return matrix


## find the sum of magnitudes if 2:4 were applied to a matrix
def sum_after_2_to_4(matrix):
    """Return the total magnitude that would survive 2:4 pruning.

    For every row and every group of 4 columns, the two largest absolute
    values are added up. The input is not modified.

    On the CPU path a trailing group with fewer than 4 columns raises
    ``IndexError``. On the GPU path the work is delegated to the
    ``permutation_search_cuda`` extension and the result is a float32 scalar.
    """
    if not use_gpu():
        cur_sum = 0.0
        for row in range(matrix.shape[0]):
            for col in range(0, matrix.shape[1], 4):
                magnitudes = np.abs(matrix[row, col:col + 4])
                order = np.argsort(magnitudes)
                # Ascending sort: positions 2 and 3 are the two survivors.
                cur_sum += magnitudes[order[2]]
                cur_sum += magnitudes[order[3]]
        return cur_sum

    # astype() and flatten() each copy, so the kernel never sees caller memory.
    matrix_view = matrix.astype(np.float32).flatten()
    sum_view = np.zeros(1, dtype=np.float32)
    n_rows, n_cols = matrix.shape[0], matrix.shape[1]
    # Launch geometry: one block per 8 columns, one thread per 4 rows (max 1024).
    blocks = max(int(n_cols / 4 / 2), 1)
    threads = min(max(math.ceil(n_rows / 4), 1), 1024)
    # NOTE(review): the `0, n_cols` pair presumably selects the column range to
    # evaluate and the kernel presumably writes its result into sum_view --
    # confirm against the extension's source.
    permutation_search_cuda_kernels.sum_after_2_to_4(
        matrix_view, n_rows, n_cols, 0, n_cols, blocks, threads, sum_view
    )
    return sum_view[0]


## try swapping columns and tracking magnitude after pruning
def try_swap(matrix, dst, src):
    """Evaluate swapping columns ``src`` and ``dst`` without keeping the swap.

    Only the two 4-column groups containing ``src`` and ``dst`` are scored.
    The swap is performed on ``matrix`` temporarily and always undone before
    returning, even if scoring raises.

    Returns:
        A tuple ``(new_sum, improvement)`` where ``new_sum`` is the post-swap
        retained magnitude of the two groups and ``improvement`` is
        ``new_sum`` minus the pre-swap value for the same groups.
    """
    # First column of the group of 4 that holds each index.
    src_start = int(src / 4) * 4
    dst_start = int(dst / 4) * 4

    src_base = sum_after_2_to_4(matrix[..., src_start:src_start + 4])
    dst_base = sum_after_2_to_4(matrix[..., dst_start:dst_start + 4])

    # swap
    matrix[..., [src, dst]] = matrix[..., [dst, src]]
    try:
        # check the Nx4 slices of the swapped columns
        src_sum = sum_after_2_to_4(matrix[..., src_start:src_start + 4])
        dst_sum = sum_after_2_to_4(matrix[..., dst_start:dst_start + 4])
    finally:
        # swap back, so the caller's matrix is never left permuted
        matrix[..., [src, dst]] = matrix[..., [dst, src]]

    return src_sum + dst_sum, (src_sum + dst_sum) - (src_base + dst_base)


##############################################################################################
# permutation utilities
##############################################################################################

## find the permutation needed to make matrix A look like matrix B
def find_permutation(A, B):
    """Map each column of ``A`` to the first identical column of ``B``.

    Returns a list in which entry ``i`` is the index of the first column of
    ``B`` that equals column ``i`` of ``A`` element-wise. Columns of ``A``
    with no match in ``B`` are skipped silently, so the result can be shorter
    than ``A.shape[1]``. Columns containing NaN never match, since NaN is not
    equal to NaN.
    """
    permutation = []
    for col in range(A.shape[1]):
        a_column = A[..., col]
        for bcol in range(B.shape[1]):
            # array_equal rather than "difference == 0": inf - inf is nan, so a
            # subtraction-based test can never match a column that holds inf.
            if np.array_equal(a_column, B[..., bcol]):
                permutation.append(bcol)
                break
    return permutation