我正在尝试利用NumPy广播和后端数组计算来显着加快此功能。不幸的是,它不能很好地扩展,因此我希望可以大大提高其性能。目前,该代码未正确利用广播进行计算。
我正在使用WGCNA的bicor函数作为黄金标准,因为这是我目前所知道的最快的实现。Python版本输出的结果与R函数相同。
# ==============================================================================
# Imports
# ==============================================================================
# Built-ins
import os, sys, time, multiprocessing
# 3rd party
import numpy as np
import pandas as pd
# ==============================================================================
# R Imports
# ==============================================================================
from rpy2 import robjects, rinterface
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
pandas2ri.activate()
R = robjects.r
NULL = robjects.rinterface.NULL
rinterface.set_writeconsole_regular(None)
WGCNA = importr("WGCNA")
# Python
def _biweight_midcorrelation(a, b):
a_median = np.median(a)
b_median = np.median(b)
# Median absolute deviation
a_mad = np.median(np.abs(a - a_median))
b_mad = np.median(np.abs(b - b_median))
u = (a - a_median) / (9 * a_mad)
v = (b - b_median) / (9 * b_mad)
w_a = np.square(1 - np.square(u)) * ((1 - np.abs(u)) > 0)
w_b = np.square(1 - np.square(v)) * ((1 - np.abs(v)) > 0)
a_item = (a - a_median) * w_a
b_item = (b - b_median) * w_b
return (a_item * b_item).sum() / (
np.sqrt(np.square(a_item).sum()) *
np.sqrt(np.square(b_item).sum()))
def biweight_midcorrelation(X):
return X.corr(method=_biweight_midcorrelation)
# # OLD IMPLEMENTATION
# def biweight_midcorrelation(X):
# median = X.median()
# mad = (X - median).abs().median()
# U = (X - median) / (9 * mad)
# adjacency = np.square(1 - np.square(U)) * ((1 - U.abs()) > 0)
# estimator = (X - median) * adjacency
# bicor_matrix = np.empty((X.shape[1], X.shape[1]), dtype=float)
# for i, ac in enumerate(estimator):
# for j, bc in enumerate(estimator):
# a = estimator[ac]
# b = estimator[bc]
# c = (a * b).sum() / (
# np.sqrt(np.square(a).sum()) * np.sqrt(np.square(b).sum()))
# bicor_matrix[i, j] = c
# bicor_matrix[j, i] = c
# return pd.DataFrame(bicor_matrix, index=X.columns, columns=X.columns)
# R
def biweight_midcorrelation_r_wrapper(X, n_jobs=-1, r_package=None):
"""
WGCNA: bicor
function (x, y = NULL, robustX = TRUE, robustY = TRUE, use = "all.obs",
maxPOutliers = 1, qu <...> dian absolute deviation, or zero variance."))
"""
if r_package is None:
r_package = importr("WGCNA")
if n_jobs == -1:
n_jobs = multiprocessing.cpu_count()
labels = X.columns
r_df_sim = r_package.bicor(pandas2ri.py2ri(X), nThreads=n_jobs)
df_bicor = pd.DataFrame(pandas2ri.ri2py(r_df_sim), index=labels, columns=labels)
return df_bicor
# X.shape = (150,4)
X = pd.DataFrame({'sepal_length': {'iris_0': 5.1, 'iris_1': 4.9, 'iris_2': 4.7, 'iris_3': 4.6, 'iris_4': 5.0, 'iris_5': 5.4, 'iris_6': 4.6, 'iris_7': 5.0, 'iris_8': 4.4, 'iris_9': 4.9, 'iris_10': 5.4, 'iris_11': 4.8, 'iris_12': 4.8, 'iris_13': 4.3, 'iris_14': 5.8, 'iris_15': 5.7, 'iris_16': 5.4, 'iris_17': 5.1, 'iris_18': 5.7, 'iris_19': 5.1, 'iris_20': 5.4, 'iris_21': 5.1, 'iris_22': 4.6, 'iris_23': 5.1, 'iris_24': 4.8, 'iris_25': 5.0, 'iris_26': 5.0, 'iris_27': 5.2, 'iris_28': 5.2, 'iris_29': 4.7, 'iris_30': 4.8, 'iris_31': 5.4, 'iris_32': 5.2, 'iris_33': 5.5, 'iris_34': 4.9, 'iris_35': 5.0, 'iris_36': 5.5, 'iris_37': 4.9, 'iris_38': 4.4, 'iris_39': 5.1, 'iris_40': 5.0, 'iris_41': 4.5, 'iris_42': 4.4, 'iris_43': 5.0, 'iris_44': 5.1, 'iris_45': 4.8, 'iris_46': 5.1, 'iris_47': 4.6, 'iris_48': 5.3, 'iris_49': 5.0, 'iris_50': 7.0, 'iris_51': 6.4, 'iris_52': 6.9, 'iris_53': 5.5, 'iris_54': 6.5, 'iris_55': 5.7, 'iris_56': 6.3, 'iris_57': 4.9, 'iris_58': 6.6, 'iris_59': 5.2, 'iris_60': 5.0, 'iris_61': 5.9, 'iris_62': 6.0, 'iris_63': 6.1, 'iris_64': 5.6, 'iris_65': 6.7, 'iris_66': 5.6, 'iris_67': 5.8, 'iris_68': 6.2, 'iris_69': 5.6, 'iris_70': 5.9, 'iris_71': 6.1, 'iris_72': 6.3, 'iris_73': 6.1, 'iris_74': 6.4, 'iris_75': 6.6, 'iris_76': 6.8, 'iris_77': 6.7, 'iris_78': 6.0, 'iris_79': 5.7, 'iris_80': 5.5, 'iris_81': 5.5, 'iris_82': 5.8, 'iris_83': 6.0, 'iris_84': 5.4, 'iris_85': 6.0, 'iris_86': 6.7, 'iris_87': 6.3, 'iris_88': 5.6, 'iris_89': 5.5, 'iris_90': 5.5, 'iris_91': 6.1, 'iris_92': 5.8, 'iris_93': 5.0, 'iris_94': 5.6, 'iris_95': 5.7, 'iris_96': 5.7, 'iris_97': 6.2, 'iris_98': 5.1, 'iris_99': 5.7, 'iris_100': 6.3, 'iris_101': 5.8, 'iris_102': 7.1, 'iris_103': 6.3, 'iris_104': 6.5, 'iris_105': 7.6, 'iris_106': 4.9, 'iris_107': 7.3, 'iris_108': 6.7, 'iris_109': 7.2, 'iris_110': 6.5, 'iris_111': 6.4, 'iris_112': 6.8, 'iris_113': 5.7, 'iris_114': 5.8, 'iris_115': 6.4, 'iris_116': 6.5, 'iris_117': 7.7, 'iris_118': 7.7, 'iris_119': 6.0, 'iris_120': 6.9, 'iris_121': 5.6, 'iris_122': 7.7, 'iris_123': 6.3, 'iris_124': 6.7, 'iris_125': 7.2, 'iris_126': 6.2, 'iris_127': 6.1, 'iris_128': 6.4, 'iris_129': 7.2, 'iris_130': 7.4, 'iris_131': 7.9, 'iris_132': 6.4, 'iris_133': 6.3, 'iris_134': 6.1, 'iris_135': 7.7, 'iris_136': 6.3, 'iris_137': 6.4, 'iris_138': 6.0, 'iris_139': 6.9, 'iris_140': 6.7, 'iris_141': 6.9, 'iris_142': 5.8, 'iris_143': 6.8, 'iris_144': 6.7, 'iris_145': 6.7, 'iris_146': 6.3, 'iris_147': 6.5, 'iris_148': 6.2, 'iris_149': 5.9}, 'sepal_width': {'iris_0': 3.5, 'iris_1': 3.0, 'iris_2': 3.2, 'iris_3': 3.1, 'iris_4': 3.6, 'iris_5': 3.9, 'iris_6': 3.4, 'iris_7': 3.4, 'iris_8': 2.9, 'iris_9': 3.1, 'iris_10': 3.7, 'iris_11': 3.4, 'iris_12': 3.0, 'iris_13': 3.0, 'iris_14': 4.0, 'iris_15': 4.4, 'iris_16': 3.9, 'iris_17': 3.5, 'iris_18': 3.8, 'iris_19': 3.8, 'iris_20': 3.4, 'iris_21': 3.7, 'iris_22': 3.6, 'iris_23': 3.3, 'iris_24': 3.4, 'iris_25': 3.0, 'iris_26': 3.4, 'iris_27': 3.5, 'iris_28': 3.4, 'iris_29': 3.2, 'iris_30': 3.1, 'iris_31': 3.4, 'iris_32': 4.1, 'iris_33': 4.2, 'iris_34': 3.1, 'iris_35': 3.2, 'iris_36': 3.5, 'iris_37': 3.6, 'iris_38': 3.0, 'iris_39': 3.4, 'iris_40': 3.5, 'iris_41': 2.3, 'iris_42': 3.2, 'iris_43': 3.5, 'iris_44': 3.8, 'iris_45': 3.0, 'iris_46': 3.8, 'iris_47': 3.2, 'iris_48': 3.7, 'iris_49': 3.3, 'iris_50': 3.2, 'iris_51': 3.2, 'iris_52': 3.1, 'iris_53': 2.3, 'iris_54': 2.8, 'iris_55': 2.8, 'iris_56': 3.3, 'iris_57': 2.4, 'iris_58': 2.9, 'iris_59': 2.7, 'iris_60': 2.0, 'iris_61': 3.0, 'iris_62': 2.2, 'iris_63': 2.9, 'iris_64': 2.9, 'iris_65': 3.1, 'iris_66': 3.0, 'iris_67': 2.7, 'iris_68': 2.2, 'iris_69': 2.5, 'iris_70': 3.2, 'iris_71': 2.8, 'iris_72': 2.5, 'iris_73': 2.8, 'iris_74': 2.9, 'iris_75': 3.0, 'iris_76': 2.8, 'iris_77': 3.0, 'iris_78': 2.9, 'iris_79': 2.6, 'iris_80': 2.4, 'iris_81': 2.4, 'iris_82': 2.7, 'iris_83': 2.7, 'iris_84': 3.0, 'iris_85': 3.4, 'iris_86': 3.1, 'iris_87': 2.3, 'iris_88': 3.0, 'iris_89': 2.5, 'iris_90': 2.6, 'iris_91': 3.0, 'iris_92': 2.6, 'iris_93': 2.3, 'iris_94': 2.7, 'iris_95': 3.0, 'iris_96': 2.9, 'iris_97': 2.9, 'iris_98': 2.5, 'iris_99': 2.8, 'iris_100': 3.3, 'iris_101': 2.7, 'iris_102': 3.0, 'iris_103': 2.9, 'iris_104': 3.0, 'iris_105': 3.0, 'iris_106': 2.5, 'iris_107': 2.9, 'iris_108': 2.5, 'iris_109': 3.6, 'iris_110': 3.2, 'iris_111': 2.7, 'iris_112': 3.0, 'iris_113': 2.5, 'iris_114': 2.8, 'iris_115': 3.2, 'iris_116': 3.0, 'iris_117': 3.8, 'iris_118': 2.6, 'iris_119': 2.2, 'iris_120': 3.2, 'iris_121': 2.8, 'iris_122': 2.8, 'iris_123': 2.7, 'iris_124': 3.3, 'iris_125': 3.2, 'iris_126': 2.8, 'iris_127': 3.0, 'iris_128': 2.8, 'iris_129': 3.0, 'iris_130': 2.8, 'iris_131': 3.8, 'iris_132': 2.8, 'iris_133': 2.8, 'iris_134': 2.6, 'iris_135': 3.0, 'iris_136': 3.4, 'iris_137': 3.1, 'iris_138': 3.0, 'iris_139': 3.1, 'iris_140': 3.1, 'iris_141': 3.1, 'iris_142': 2.7, 'iris_143': 3.2, 'iris_144': 3.3, 'iris_145': 3.0, 'iris_146': 2.5, 'iris_147': 3.0, 'iris_148': 3.4, 'iris_149': 3.0}, 'petal_length': {'iris_0': 1.4, 'iris_1': 1.4, 'iris_2': 1.3, 'iris_3': 1.5, 'iris_4': 1.4, 'iris_5': 1.7, 'iris_6': 1.4, 'iris_7': 1.5, 'iris_8': 1.4, 'iris_9': 1.5, 'iris_10': 1.5, 'iris_11': 1.6, 'iris_12': 1.4, 'iris_13': 1.1, 'iris_14': 1.2, 'iris_15': 1.5, 'iris_16': 1.3, 'iris_17': 1.4, 'iris_18': 1.7, 'iris_19': 1.5, 'iris_20': 1.7, 'iris_21': 1.5, 'iris_22': 1.0, 'iris_23': 1.7, 'iris_24': 1.9, 'iris_25': 1.6, 'iris_26': 1.6, 'iris_27': 1.5, 'iris_28': 1.4, 'iris_29': 1.6, 'iris_30': 1.6, 'iris_31': 1.5, 'iris_32': 1.5, 'iris_33': 1.4, 'iris_34': 1.5, 'iris_35': 1.2, 'iris_36': 1.3, 'iris_37': 1.4, 'iris_38': 1.3, 'iris_39': 1.5, 'iris_40': 1.3, 'iris_41': 1.3, 'iris_42': 1.3, 'iris_43': 1.6, 'iris_44': 1.9, 'iris_45': 1.4, 'iris_46': 1.6, 'iris_47': 1.4, 'iris_48': 1.5, 'iris_49': 1.4, 'iris_50': 4.7, 'iris_51': 4.5, 'iris_52': 4.9, 'iris_53': 4.0, 'iris_54': 4.6, 'iris_55': 4.5, 'iris_56': 4.7, 'iris_57': 3.3, 'iris_58': 4.6, 'iris_59': 3.9, 'iris_60': 3.5, 'iris_61': 4.2, 'iris_62': 4.0, 'iris_63': 4.7, 'iris_64': 3.6, 'iris_65': 4.4, 'iris_66': 4.5, 'iris_67': 4.1, 'iris_68': 4.5, 'iris_69': 3.9, 'iris_70': 4.8, 'iris_71': 4.0, 'iris_72': 4.9, 'iris_73': 4.7, 'iris_74': 4.3, 'iris_75': 4.4, 'iris_76': 4.8, 'iris_77': 5.0, 'iris_78': 4.5, 'iris_79': 3.5, 'iris_80': 3.8, 'iris_81': 3.7, 'iris_82': 3.9, 'iris_83': 5.1, 'iris_84': 4.5, 'iris_85': 4.5, 'iris_86': 4.7, 'iris_87': 4.4, 'iris_88': 4.1, 'iris_89': 4.0, 'iris_90': 4.4, 'iris_91': 4.6, 'iris_92': 4.0, 'iris_93': 3.3, 'iris_94': 4.2, 'iris_95': 4.2, 'iris_96': 4.2, 'iris_97': 4.3, 'iris_98': 3.0, 'iris_99': 4.1, 'iris_100': 6.0, 'iris_101': 5.1, 'iris_102': 5.9, 'iris_103': 5.6, 'iris_104': 5.8, 'iris_105': 6.6, 'iris_106': 4.5, 'iris_107': 6.3, 'iris_108': 5.8, 'iris_109': 6.1, 'iris_110': 5.1, 'iris_111': 5.3, 'iris_112': 5.5, 'iris_113': 5.0, 'iris_114': 5.1, 'iris_115': 5.3, 'iris_116': 5.5, 'iris_117': 6.7, 'iris_118': 6.9, 'iris_119': 5.0, 'iris_120': 5.7, 'iris_121': 4.9, 'iris_122': 6.7, 'iris_123': 4.9, 'iris_124': 5.7, 'iris_125': 6.0, 'iris_126': 4.8, 'iris_127': 4.9, 'iris_128': 5.6, 'iris_129': 5.8, 'iris_130': 6.1, 'iris_131': 6.4, 'iris_132': 5.6, 'iris_133': 5.1, 'iris_134': 5.6, 'iris_135': 6.1, 'iris_136': 5.6, 'iris_137': 5.5, 'iris_138': 4.8, 'iris_139': 5.4, 'iris_140': 5.6, 'iris_141': 5.1, 'iris_142': 5.1, 'iris_143': 5.9, 'iris_144': 5.7, 'iris_145': 5.2, 'iris_146': 5.0, 'iris_147': 5.2, 'iris_148': 5.4, 'iris_149': 5.1}, 'petal_width': {'iris_0': 0.2, 'iris_1': 0.2, 'iris_2': 0.2, 'iris_3': 0.2, 'iris_4': 0.2, 'iris_5': 0.4, 'iris_6': 0.3, 'iris_7': 0.2, 'iris_8': 0.2, 'iris_9': 0.1, 'iris_10': 0.2, 'iris_11': 0.2, 'iris_12': 0.1, 'iris_13': 0.1, 'iris_14': 0.2, 'iris_15': 0.4, 'iris_16': 0.4, 'iris_17': 0.3, 'iris_18': 0.3, 'iris_19': 0.3, 'iris_20': 0.2, 'iris_21': 0.4, 'iris_22': 0.2, 'iris_23': 0.5, 'iris_24': 0.2, 'iris_25': 0.2, 'iris_26': 0.4, 'iris_27': 0.2, 'iris_28': 0.2, 'iris_29': 0.2, 'iris_30': 0.2, 'iris_31': 0.4, 'iris_32': 0.1, 'iris_33': 0.2, 'iris_34': 0.2, 'iris_35': 0.2, 'iris_36': 0.2, 'iris_37': 0.1, 'iris_38': 0.2, 'iris_39': 0.2, 'iris_40': 0.3, 'iris_41': 0.3, 'iris_42': 0.2, 'iris_43': 0.6, 'iris_44': 0.4, 'iris_45': 0.3, 'iris_46': 0.2, 'iris_47': 0.2, 'iris_48': 0.2, 'iris_49': 0.2, 'iris_50': 1.4, 'iris_51': 1.5, 'iris_52': 1.5, 'iris_53': 1.3, 'iris_54': 1.5, 'iris_55': 1.3, 'iris_56': 1.6, 'iris_57': 1.0, 'iris_58': 1.3, 'iris_59': 1.4, 'iris_60': 1.0, 'iris_61': 1.5, 'iris_62': 1.0, 'iris_63': 1.4, 'iris_64': 1.3, 'iris_65': 1.4, 'iris_66': 1.5, 'iris_67': 1.0, 'iris_68': 1.5, 'iris_69': 1.1, 'iris_70': 1.8, 'iris_71': 1.3, 'iris_72': 1.5, 'iris_73': 1.2, 'iris_74': 1.3, 'iris_75': 1.4, 'iris_76': 1.4, 'iris_77': 1.7, 'iris_78': 1.5, 'iris_79': 1.0, 'iris_80': 1.1, 'iris_81': 1.0, 'iris_82': 1.2, 'iris_83': 1.6, 'iris_84': 1.5, 'iris_85': 1.6, 'iris_86': 1.5, 'iris_87': 1.3, 'iris_88': 1.3, 'iris_89': 1.3, 'iris_90': 1.2, 'iris_91': 1.4, 'iris_92': 1.2, 'iris_93': 1.0, 'iris_94': 1.3, 'iris_95': 1.2, 'iris_96': 1.3, 'iris_97': 1.3, 'iris_98': 1.1, 'iris_99': 1.3, 'iris_100': 2.5, 'iris_101': 1.9, 'iris_102': 2.1, 'iris_103': 1.8, 'iris_104': 2.2, 'iris_105': 2.1, 'iris_106': 1.7, 'iris_107': 1.8, 'iris_108': 1.8, 'iris_109': 2.5, 'iris_110': 2.0, 'iris_111': 1.9, 'iris_112': 2.1, 'iris_113': 2.0, 'iris_114': 2.4, 'iris_115': 2.3, 'iris_116': 1.8, 'iris_117': 2.2, 'iris_118': 2.3, 'iris_119': 1.5, 'iris_120': 2.3, 'iris_121': 2.0, 'iris_122': 2.0, 'iris_123': 1.8, 'iris_124': 2.1, 'iris_125': 1.8, 'iris_126': 1.8, 'iris_127': 1.8, 'iris_128': 2.1, 'iris_129': 1.6, 'iris_130': 1.9, 'iris_131': 2.0, 'iris_132': 2.2, 'iris_133': 1.5, 'iris_134': 1.4, 'iris_135': 2.3, 'iris_136': 2.4, 'iris_137': 1.8, 'iris_138': 1.8, 'iris_139': 2.1, 'iris_140': 2.4, 'iris_141': 2.3, 'iris_142': 1.9, 'iris_143': 2.3, 'iris_144': 2.5, 'iris_145': 2.3, 'iris_146': 1.9, 'iris_147': 2.0, 'iris_148': 2.3, 'iris_149': 1.8}})
# Python computation
start_time = time.time()
df_bicor__python = biweight_midcorrelation(X)
# R computation
df_bicor__r = biweight_midcorrelation_r_wrapper(X)
np.allclose(df_bicor__python, df_bicor__r)
一个人可以写这个计算约。使用以下命令,速度快一个数量级(对于您指定的输入):
import numpy as np
def biweight_midcorrelation(arr):
n, m = arr.shape
arr = arr - np.median(arr, axis=0, keepdims=True)
v = 1 - (arr / (9 * np.median(np.abs(arr), axis=0, keepdims=True))) ** 2
arr = arr * v ** 2 * (v > 0)
norms = np.sqrt(np.sum(arr ** 2, axis=0))
return np.einsum('mi,mj->ij', arr, arr) / norms[:, None] / norms[None, :]
通过以下方式桥接到Pandas数据框:
import pandas as pd
def corr_np2pd(df, func):
return pd.DataFrame(func(np.array(df)), index=df.columns, columns=df.columns)
其用法是:
corr_df = corr_np2pd(df, biweight_midcorrelation)
通过使用Numba进行最后的计算,可以使速度更快。
我不太确定为什么您希望广播在当前代码中有所帮助。您可能是指向量化吗?无论如何,我相信可以编写更快的代码,并且您的“旧”方法的矢量化版本将胜过当前方法。使用Numba可以使速度更快。
有两种实用的方法可以解决您的问题:
pd.DataFrame.corr()
在进行(1)时,如果不计算相关矩阵的不必要部分,可能无法避免显式循环。
当进行(2)中,这将是必要的计算计算每个(对称)对一个维输入(的辅助值2 * comb(n, 2)
倍),而不是计算辅助值仅一次为每个1D输入(n
次) 。例如,对于问题中指定的输入,将需要执行n == 4
预计算,但是,如果以成对方式进行,则该数字变为2 * comb(4, 2) == 12
。
让我们看看如何在两种情况下都能提高性能。
让我们首先定义一个函数以充当Pandas到NumPy的桥梁:
import numpy as np
import pandas as pd
def corr_np2pd(df, func):
return pd.DataFrame(func(np.array(df)), index=df.columns, columns=df.columns)
现在,注释中带有显式循环的函数属于此类别,下面将其报告为biweight_midcorrelation_pd_OP()
:
def biweight_midcorrelation_pd_OP(X):
median = X.median()
mad = (X - median).abs().median()
U = (X - median) / (9 * mad)
adjacency = np.square(1 - np.square(U)) * ((1 - U.abs()) > 0)
estimator = (X - median) * adjacency
bicor_matrix = np.empty((X.shape[1], X.shape[1]), dtype=float)
for i, ac in enumerate(estimator):
for j, bc in enumerate(estimator):
a = estimator[ac]
b = estimator[bc]
c = (a * b).sum() / (
np.sqrt(np.square(a).sum()) * np.sqrt(np.square(b).sum()))
bicor_matrix[i, j] = c
bicor_matrix[j, i] = c
return pd.DataFrame(bicor_matrix, index=X.columns, columns=X.columns)
对此进行了稍微修改的版本,其中计算完全在NumPy中完成,应与一起使用corr_np2pd()
,内容为:
def biweight_midcorrelation_OP(arr):
n, m = arr.shape
med = np.median(arr, axis=0, keepdims=True)
mad = np.median(np.abs(arr - med), axis=0, keepdims=True)
u = (arr - med) / (9 * mad)
adj = ((1 - u ** 2) ** 2) * ((1 - np.abs(u)) > 0)
est = (arr - med) * adj
result = np.empty((m, m))
for i in range(m):
for j in range(m):
a = est[:, i]
b = est[:, j]
c = (a * b).sum() / (
np.sqrt(np.sum(a ** 2)) * np.sqrt(np.sum(b ** 2)))
result[i, j] = result[j, i] = c
return result
现在,这有一些改进点:
最后一点可以通过以下两种方法进行改进:
biweight_midcorrelation_np()
biweight_midcorrelation_npv()
def biweight_midcorrelation_np(arr):
n, m = arr.shape
arr = arr - np.median(arr, axis=0, keepdims=True)
v = 1 - (arr / (9 * np.median(np.abs(arr), axis=0, keepdims=True))) ** 2
arr = arr * v ** 2 * (v > 0)
norms = np.sqrt(np.sum(arr ** 2, axis=0))
result = np.empty((m, m))
np.fill_diagonal(result, 1.0)
for i, j in zip(*np.triu_indices(m, 1)):
result[i, j] = result[j, i] = \
np.sum(arr[:, i] * arr[:, j]) / norms[i] / norms[j]
return result
def biweight_midcorrelation_npv(arr):
n, m = arr.shape
arr = arr - np.median(arr, axis=0, keepdims=True)
v = 1 - (arr / (9 * np.median(np.abs(arr), axis=0, keepdims=True))) ** 2
arr = arr * v ** 2 * (v > 0)
norms = np.sqrt(np.sum(arr ** 2, axis=0))
return np.einsum('mi,mj->ij', arr, arr) / norms[:, None] / norms[None, :]
m
由于显式循环,第一个只要很小就将很快。第二个通常会很快,但是两次计算矩阵的某些项似乎效率不高。为了克服这两个问题,可以使用Numba重写最后一个循环:
import numba as nb
@nb.jit
def _biweight_midcorrelation_triu_nb(n, m, est, norms, result):
for i in range(m):
for j in range(i + 1, m):
x = 0
for k in range(n):
x += est[k, i] * est[k, j]
result[i, j] = result[j, i] = x / norms[i] / norms[j]
def biweight_midcorrelation_nb(arr):
n, m = arr.shape
arr = arr - np.median(arr, axis=0, keepdims=True)
v = 1 - (arr / (9 * np.median(np.abs(arr), axis=0, keepdims=True))) ** 2
arr = arr * v ** 2 * (v > 0)
norms = np.sqrt(np.sum(arr ** 2, axis=0))
result = np.empty((m, m))
np.fill_diagonal(result, 1.0)
_biweight_midcorrelation_triu_nb(n, m, arr, norms, result)
return result
您现在提议的方法的稍作修改的版本属于以下类别:
def pairwise_biweight_midcorrelation_OP(a, b):
a_median = np.median(a)
b_median = np.median(b)
a_mad = np.median(np.abs(a - a_median))
b_mad = np.median(np.abs(b - b_median))
u_a = (a - a_median) / (9 * a_mad)
u_b = (b - b_median) / (9 * b_mad)
adj_a = (1 - u_a ** 2) ** 2 * ((1 - np.abs(u_a)) > 0)
adj_b = (1 - u_b ** 2) ** 2 * ((1 - np.abs(u_b)) > 0)
a = (a - a_median) * adj_a
b = (b - b_median) * adj_b
return np.sum(a * b) / (np.sqrt(np.sum(a ** 2)) * np.sqrt(np.sum(b ** 2)))
使用上面类似的简化,可以使代码更简洁一些,导致:
def pairwise_biweight_midcorrelation_opt(a, b):
a = a - np.median(a)
b = b - np.median(b)
v_a = 1 - (a / (9 * np.median(np.abs(a)))) ** 2
v_b = 1 - (b / (9 * np.median(np.abs(b)))) ** 2
a = a * v_a ** 2 * (v_a > 0)
b = b * v_b ** 2 * (v_b > 0)
return np.sum(a * b) / (np.sqrt(np.sum(a ** 2)) * np.sqrt(np.sum(b ** 2)))
最后的操作是在执行求和a
,并b
三次,但它实际上在一个循环,这可能与Numba再次做出快速完成:
@nb.jit
def pairwise_biweight_midcorrelation_nb(a, b):
n = a.size
a = a - np.median(a)
b = b - np.median(b)
v_a = 1 - (a / (9 * np.median(np.abs(a)))) ** 2
v_b = 1 - (b / (9 * np.median(np.abs(b)))) ** 2
a = (v_a > 0) * a * v_a ** 2
b = (v_b > 0) * b * v_b ** 2
s_ab = s_aa = s_bb = 0
for i in range(n):
s_ab += a[i] * b[i]
s_aa += a[i] * a[i]
s_bb += b[i] * b[i]
return s_ab / np.sqrt(s_aa) / np.sqrt(s_bb)
但是没有避免执行预计算2 * comb(n, 2)
时间而不是n
时间的简单方法。故事的另一面是,此类方法需要较少的内存,因为每次迭代仅考虑两个一维数组。
对于建议的输入:
import pandas as pd
df = pd.DataFrame({'sepal_length': {'iris_0': 5.1, 'iris_1': 4.9, 'iris_2': 4.7, 'iris_3': 4.6, 'iris_4': 5.0, 'iris_5': 5.4, 'iris_6': 4.6, 'iris_7': 5.0, 'iris_8': 4.4, 'iris_9': 4.9, 'iris_10': 5.4, 'iris_11': 4.8, 'iris_12': 4.8, 'iris_13': 4.3, 'iris_14': 5.8, 'iris_15': 5.7, 'iris_16': 5.4, 'iris_17': 5.1, 'iris_18': 5.7, 'iris_19': 5.1, 'iris_20': 5.4, 'iris_21': 5.1, 'iris_22': 4.6, 'iris_23': 5.1, 'iris_24': 4.8, 'iris_25': 5.0, 'iris_26': 5.0, 'iris_27': 5.2, 'iris_28': 5.2, 'iris_29': 4.7, 'iris_30': 4.8, 'iris_31': 5.4, 'iris_32': 5.2, 'iris_33': 5.5, 'iris_34': 4.9, 'iris_35': 5.0, 'iris_36': 5.5, 'iris_37': 4.9, 'iris_38': 4.4, 'iris_39': 5.1, 'iris_40': 5.0, 'iris_41': 4.5, 'iris_42': 4.4, 'iris_43': 5.0, 'iris_44': 5.1, 'iris_45': 4.8, 'iris_46': 5.1, 'iris_47': 4.6, 'iris_48': 5.3, 'iris_49': 5.0, 'iris_50': 7.0, 'iris_51': 6.4, 'iris_52': 6.9, 'iris_53': 5.5, 'iris_54': 6.5, 'iris_55': 5.7, 'iris_56': 6.3, 'iris_57': 4.9, 'iris_58': 6.6, 'iris_59': 5.2, 'iris_60': 5.0, 'iris_61': 5.9, 'iris_62': 6.0, 'iris_63': 6.1, 'iris_64': 5.6, 'iris_65': 6.7, 'iris_66': 5.6, 'iris_67': 5.8, 'iris_68': 6.2, 'iris_69': 5.6, 'iris_70': 5.9, 'iris_71': 6.1, 'iris_72': 6.3, 'iris_73': 6.1, 'iris_74': 6.4, 'iris_75': 6.6, 'iris_76': 6.8, 'iris_77': 6.7, 'iris_78': 6.0, 'iris_79': 5.7, 'iris_80': 5.5, 'iris_81': 5.5, 'iris_82': 5.8, 'iris_83': 6.0, 'iris_84': 5.4, 'iris_85': 6.0, 'iris_86': 6.7, 'iris_87': 6.3, 'iris_88': 5.6, 'iris_89': 5.5, 'iris_90': 5.5, 'iris_91': 6.1, 'iris_92': 5.8, 'iris_93': 5.0, 'iris_94': 5.6, 'iris_95': 5.7, 'iris_96': 5.7, 'iris_97': 6.2, 'iris_98': 5.1, 'iris_99': 5.7, 'iris_100': 6.3, 'iris_101': 5.8, 'iris_102': 7.1, 'iris_103': 6.3, 'iris_104': 6.5, 'iris_105': 7.6, 'iris_106': 4.9, 'iris_107': 7.3, 'iris_108': 6.7, 'iris_109': 7.2, 'iris_110': 6.5, 'iris_111': 6.4, 'iris_112': 6.8, 'iris_113': 5.7, 'iris_114': 5.8, 'iris_115': 6.4, 'iris_116': 6.5, 'iris_117': 7.7, 'iris_118': 7.7, 'iris_119': 6.0, 'iris_120': 6.9, 'iris_121': 5.6, 'iris_122': 7.7, 'iris_123': 6.3, 'iris_124': 6.7, 'iris_125': 7.2, 'iris_126': 6.2, 'iris_127': 6.1, 'iris_128': 6.4, 'iris_129': 7.2, 'iris_130': 7.4, 'iris_131': 7.9, 'iris_132': 6.4, 'iris_133': 6.3, 'iris_134': 6.1, 'iris_135': 7.7, 'iris_136': 6.3, 'iris_137': 6.4, 'iris_138': 6.0, 'iris_139': 6.9, 'iris_140': 6.7, 'iris_141': 6.9, 'iris_142': 5.8, 'iris_143': 6.8, 'iris_144': 6.7, 'iris_145': 6.7, 'iris_146': 6.3, 'iris_147': 6.5, 'iris_148': 6.2, 'iris_149': 5.9}, 'sepal_width': {'iris_0': 3.5, 'iris_1': 3.0, 'iris_2': 3.2, 'iris_3': 3.1, 'iris_4': 3.6, 'iris_5': 3.9, 'iris_6': 3.4, 'iris_7': 3.4, 'iris_8': 2.9, 'iris_9': 3.1, 'iris_10': 3.7, 'iris_11': 3.4, 'iris_12': 3.0, 'iris_13': 3.0, 'iris_14': 4.0, 'iris_15': 4.4, 'iris_16': 3.9, 'iris_17': 3.5, 'iris_18': 3.8, 'iris_19': 3.8, 'iris_20': 3.4, 'iris_21': 3.7, 'iris_22': 3.6, 'iris_23': 3.3, 'iris_24': 3.4, 'iris_25': 3.0, 'iris_26': 3.4, 'iris_27': 3.5, 'iris_28': 3.4, 'iris_29': 3.2, 'iris_30': 3.1, 'iris_31': 3.4, 'iris_32': 4.1, 'iris_33': 4.2, 'iris_34': 3.1, 'iris_35': 3.2, 'iris_36': 3.5, 'iris_37': 3.6, 'iris_38': 3.0, 'iris_39': 3.4, 'iris_40': 3.5, 'iris_41': 2.3, 'iris_42': 3.2, 'iris_43': 3.5, 'iris_44': 3.8, 'iris_45': 3.0, 'iris_46': 3.8, 'iris_47': 3.2, 'iris_48': 3.7, 'iris_49': 3.3, 'iris_50': 3.2, 'iris_51': 3.2, 'iris_52': 3.1, 'iris_53': 2.3, 'iris_54': 2.8, 'iris_55': 2.8, 'iris_56': 3.3, 'iris_57': 2.4, 'iris_58': 2.9, 'iris_59': 2.7, 'iris_60': 2.0, 'iris_61': 3.0, 'iris_62': 2.2, 'iris_63': 2.9, 'iris_64': 2.9, 'iris_65': 3.1, 'iris_66': 3.0, 'iris_67': 2.7, 'iris_68': 2.2, 'iris_69': 2.5, 'iris_70': 3.2, 'iris_71': 2.8, 'iris_72': 2.5, 'iris_73': 2.8, 'iris_74': 2.9, 'iris_75': 3.0, 'iris_76': 2.8, 'iris_77': 3.0, 'iris_78': 2.9, 'iris_79': 2.6, 'iris_80': 2.4, 'iris_81': 2.4, 'iris_82': 2.7, 'iris_83': 2.7, 'iris_84': 3.0, 'iris_85': 3.4, 'iris_86': 3.1, 'iris_87': 2.3, 'iris_88': 3.0, 'iris_89': 2.5, 'iris_90': 2.6, 'iris_91': 3.0, 'iris_92': 2.6, 'iris_93': 2.3, 'iris_94': 2.7, 'iris_95': 3.0, 'iris_96': 2.9, 'iris_97': 2.9, 'iris_98': 2.5, 'iris_99': 2.8, 'iris_100': 3.3, 'iris_101': 2.7, 'iris_102': 3.0, 'iris_103': 2.9, 'iris_104': 3.0, 'iris_105': 3.0, 'iris_106': 2.5, 'iris_107': 2.9, 'iris_108': 2.5, 'iris_109': 3.6, 'iris_110': 3.2, 'iris_111': 2.7, 'iris_112': 3.0, 'iris_113': 2.5, 'iris_114': 2.8, 'iris_115': 3.2, 'iris_116': 3.0, 'iris_117': 3.8, 'iris_118': 2.6, 'iris_119': 2.2, 'iris_120': 3.2, 'iris_121': 2.8, 'iris_122': 2.8, 'iris_123': 2.7, 'iris_124': 3.3, 'iris_125': 3.2, 'iris_126': 2.8, 'iris_127': 3.0, 'iris_128': 2.8, 'iris_129': 3.0, 'iris_130': 2.8, 'iris_131': 3.8, 'iris_132': 2.8, 'iris_133': 2.8, 'iris_134': 2.6, 'iris_135': 3.0, 'iris_136': 3.4, 'iris_137': 3.1, 'iris_138': 3.0, 'iris_139': 3.1, 'iris_140': 3.1, 'iris_141': 3.1, 'iris_142': 2.7, 'iris_143': 3.2, 'iris_144': 3.3, 'iris_145': 3.0, 'iris_146': 2.5, 'iris_147': 3.0, 'iris_148': 3.4, 'iris_149': 3.0}, 'petal_length': {'iris_0': 1.4, 'iris_1': 1.4, 'iris_2': 1.3, 'iris_3': 1.5, 'iris_4': 1.4, 'iris_5': 1.7, 'iris_6': 1.4, 'iris_7': 1.5, 'iris_8': 1.4, 'iris_9': 1.5, 'iris_10': 1.5, 'iris_11': 1.6, 'iris_12': 1.4, 'iris_13': 1.1, 'iris_14': 1.2, 'iris_15': 1.5, 'iris_16': 1.3, 'iris_17': 1.4, 'iris_18': 1.7, 'iris_19': 1.5, 'iris_20': 1.7, 'iris_21': 1.5, 'iris_22': 1.0, 'iris_23': 1.7, 'iris_24': 1.9, 'iris_25': 1.6, 'iris_26': 1.6, 'iris_27': 1.5, 'iris_28': 1.4, 'iris_29': 1.6, 'iris_30': 1.6, 'iris_31': 1.5, 'iris_32': 1.5, 'iris_33': 1.4, 'iris_34': 1.5, 'iris_35': 1.2, 'iris_36': 1.3, 'iris_37': 1.4, 'iris_38': 1.3, 'iris_39': 1.5, 'iris_40': 1.3, 'iris_41': 1.3, 'iris_42': 1.3, 'iris_43': 1.6, 'iris_44': 1.9, 'iris_45': 1.4, 'iris_46': 1.6, 'iris_47': 1.4, 'iris_48': 1.5, 'iris_49': 1.4, 'iris_50': 4.7, 'iris_51': 4.5, 'iris_52': 4.9, 'iris_53': 4.0, 'iris_54': 4.6, 'iris_55': 4.5, 'iris_56': 4.7, 'iris_57': 3.3, 'iris_58': 4.6, 'iris_59': 3.9, 'iris_60': 3.5, 'iris_61': 4.2, 'iris_62': 4.0, 'iris_63': 4.7, 'iris_64': 3.6, 'iris_65': 4.4, 'iris_66': 4.5, 'iris_67': 4.1, 'iris_68': 4.5, 'iris_69': 3.9, 'iris_70': 4.8, 'iris_71': 4.0, 'iris_72': 4.9, 'iris_73': 4.7, 'iris_74': 4.3, 'iris_75': 4.4, 'iris_76': 4.8, 'iris_77': 5.0, 'iris_78': 4.5, 'iris_79': 3.5, 'iris_80': 3.8, 'iris_81': 3.7, 'iris_82': 3.9, 'iris_83': 5.1, 'iris_84': 4.5, 'iris_85': 4.5, 'iris_86': 4.7, 'iris_87': 4.4, 'iris_88': 4.1, 'iris_89': 4.0, 'iris_90': 4.4, 'iris_91': 4.6, 'iris_92': 4.0, 'iris_93': 3.3, 'iris_94': 4.2, 'iris_95': 4.2, 'iris_96': 4.2, 'iris_97': 4.3, 'iris_98': 3.0, 'iris_99': 4.1, 'iris_100': 6.0, 'iris_101': 5.1, 'iris_102': 5.9, 'iris_103': 5.6, 'iris_104': 5.8, 'iris_105': 6.6, 'iris_106': 4.5, 'iris_107': 6.3, 'iris_108': 5.8, 'iris_109': 6.1, 'iris_110': 5.1, 'iris_111': 5.3, 'iris_112': 5.5, 'iris_113': 5.0, 'iris_114': 5.1, 'iris_115': 5.3, 'iris_116': 5.5, 'iris_117': 6.7, 'iris_118': 6.9, 'iris_119': 5.0, 'iris_120': 5.7, 'iris_121': 4.9, 'iris_122': 6.7, 'iris_123': 4.9, 'iris_124': 5.7, 'iris_125': 6.0, 'iris_126': 4.8, 'iris_127': 4.9, 'iris_128': 5.6, 'iris_129': 5.8, 'iris_130': 6.1, 'iris_131': 6.4, 'iris_132': 5.6, 'iris_133': 5.1, 'iris_134': 5.6, 'iris_135': 6.1, 'iris_136': 5.6, 'iris_137': 5.5, 'iris_138': 4.8, 'iris_139': 5.4, 'iris_140': 5.6, 'iris_141': 5.1, 'iris_142': 5.1, 'iris_143': 5.9, 'iris_144': 5.7, 'iris_145': 5.2, 'iris_146': 5.0, 'iris_147': 5.2, 'iris_148': 5.4, 'iris_149': 5.1}, 'petal_width': {'iris_0': 0.2, 'iris_1': 0.2, 'iris_2': 0.2, 'iris_3': 0.2, 'iris_4': 0.2, 'iris_5': 0.4, 'iris_6': 0.3, 'iris_7': 0.2, 'iris_8': 0.2, 'iris_9': 0.1, 'iris_10': 0.2, 'iris_11': 0.2, 'iris_12': 0.1, 'iris_13': 0.1, 'iris_14': 0.2, 'iris_15': 0.4, 'iris_16': 0.4, 'iris_17': 0.3, 'iris_18': 0.3, 'iris_19': 0.3, 'iris_20': 0.2, 'iris_21': 0.4, 'iris_22': 0.2, 'iris_23': 0.5, 'iris_24': 0.2, 'iris_25': 0.2, 'iris_26': 0.4, 'iris_27': 0.2, 'iris_28': 0.2, 'iris_29': 0.2, 'iris_30': 0.2, 'iris_31': 0.4, 'iris_32': 0.1, 'iris_33': 0.2, 'iris_34': 0.2, 'iris_35': 0.2, 'iris_36': 0.2, 'iris_37': 0.1, 'iris_38': 0.2, 'iris_39': 0.2, 'iris_40': 0.3, 'iris_41': 0.3, 'iris_42': 0.2, 'iris_43': 0.6, 'iris_44': 0.4, 'iris_45': 0.3, 'iris_46': 0.2, 'iris_47': 0.2, 'iris_48': 0.2, 'iris_49': 0.2, 'iris_50': 1.4, 'iris_51': 1.5, 'iris_52': 1.5, 'iris_53': 1.3, 'iris_54': 1.5, 'iris_55': 1.3, 'iris_56': 1.6, 'iris_57': 1.0, 'iris_58': 1.3, 'iris_59': 1.4, 'iris_60': 1.0, 'iris_61': 1.5, 'iris_62': 1.0, 'iris_63': 1.4, 'iris_64': 1.3, 'iris_65': 1.4, 'iris_66': 1.5, 'iris_67': 1.0, 'iris_68': 1.5, 'iris_69': 1.1, 'iris_70': 1.8, 'iris_71': 1.3, 'iris_72': 1.5, 'iris_73': 1.2, 'iris_74': 1.3, 'iris_75': 1.4, 'iris_76': 1.4, 'iris_77': 1.7, 'iris_78': 1.5, 'iris_79': 1.0, 'iris_80': 1.1, 'iris_81': 1.0, 'iris_82': 1.2, 'iris_83': 1.6, 'iris_84': 1.5, 'iris_85': 1.6, 'iris_86': 1.5, 'iris_87': 1.3, 'iris_88': 1.3, 'iris_89': 1.3, 'iris_90': 1.2, 'iris_91': 1.4, 'iris_92': 1.2, 'iris_93': 1.0, 'iris_94': 1.3, 'iris_95': 1.2, 'iris_96': 1.3, 'iris_97': 1.3, 'iris_98': 1.1, 'iris_99': 1.3, 'iris_100': 2.5, 'iris_101': 1.9, 'iris_102': 2.1, 'iris_103': 1.8, 'iris_104': 2.2, 'iris_105': 2.1, 'iris_106': 1.7, 'iris_107': 1.8, 'iris_108': 1.8, 'iris_109': 2.5, 'iris_110': 2.0, 'iris_111': 1.9, 'iris_112': 2.1, 'iris_113': 2.0, 'iris_114': 2.4, 'iris_115': 2.3, 'iris_116': 1.8, 'iris_117': 2.2, 'iris_118': 2.3, 'iris_119': 1.5, 'iris_120': 2.3, 'iris_121': 2.0, 'iris_122': 2.0, 'iris_123': 1.8, 'iris_124': 2.1, 'iris_125': 1.8, 'iris_126': 1.8, 'iris_127': 1.8, 'iris_128': 2.1, 'iris_129': 1.6, 'iris_130': 1.9, 'iris_131': 2.0, 'iris_132': 2.2, 'iris_133': 1.5, 'iris_134': 1.4, 'iris_135': 2.3, 'iris_136': 2.4, 'iris_137': 1.8, 'iris_138': 1.8, 'iris_139': 2.1, 'iris_140': 2.4, 'iris_141': 2.3, 'iris_142': 1.9, 'iris_143': 2.3, 'iris_144': 2.5, 'iris_145': 2.3, 'iris_146': 1.9, 'iris_147': 2.0, 'iris_148': 2.3, 'iris_149': 1.8}})
我们获得:
print(np.all(np.isclose(biweight_midcorrelation_pd_OP(df), result)))
# True
print(np.all(np.isclose(corr_np2pd(df, biweight_midcorrelation_OP), result)))
# True
print(np.all(np.isclose(corr_np2pd(df, biweight_midcorrelation_np), result)))
# True
print(np.all(np.isclose(corr_np2pd(df, biweight_midcorrelation_npv), result)))
# True
print(np.all(np.isclose(corr_np2pd(df, biweight_midcorrelation_nb), result)))
# True
print(np.all(np.isclose(df.corr(method=pairwise_biweight_midcorrelation_OP), result)))
# True
print(np.all(np.isclose(df.corr(method=pairwise_biweight_midcorrelation_opt), result)))
# True
print(np.all(np.isclose(df.corr(method=pairwise_biweight_midcorrelation_nb), result)))
# True
%timeit biweight_midcorrelation_pd_OP(df)
# 10 loops, best of 3: 22.1 ms per loop
%timeit corr_np2pd(df, biweight_midcorrelation_OP)
# 1000 loops, best of 3: 682 µs per loop
%timeit corr_np2pd(df, biweight_midcorrelation_np)
# 1000 loops, best of 3: 422 µs per loop
%timeit corr_np2pd(df, biweight_midcorrelation_npv)
# 1000 loops, best of 3: 341 µs per loop
%timeit corr_np2pd(df, biweight_midcorrelation_nb)
# 1000 loops, best of 3: 325 µs per loop
%timeit df.corr(method=pairwise_biweight_midcorrelation_OP)
# 100 loops, best of 3: 1.96 ms per loop
%timeit df.corr(method=pairwise_biweight_midcorrelation_opt)
# 100 loops, best of 3: 1.83 ms per loop
%timeit df.corr(method=pairwise_biweight_midcorrelation_nb)
# 1000 loops, best of 3: 506 µs per loop
这些结果表明基于Numba的方法是最快的,紧随其后的是原始方法的NumPy矢量化版本。
请注意,从基于Pandas的计算过渡到基于NumPy的纯方法(甚至具有显式循环),我们得到的速度因子几乎是30倍。对两个for
循环进行矢量化处理后,我们可以得到另一个近似值。2倍
该pd.DataFrame.corr()
基础的方法(ES)是,不使用时Numba,约。比用NumPy重写的原始方法慢4倍,因此即使看不到显式循环也要小心!Numba加速器pairwise_biweight_midcorrelation_nb()
极大地促进了这一系列方法的实现,但它可能无法避免预计算的开销。
最后警告:所有这些基准都应加盐!
(经过编辑,以包含基于Numba的方法来使用pd.DataFrame.corr()
)。
本文收集自互联网,转载请注明来源。
如有侵权,请联系 [email protected] 删除。
我来说两句