对不起,标题不好。请编辑以使其有意义。
下面有很多代码。别担心。这只是一个最小的例子。
我想要做的是按标签对数据进行分组,并对每个分组应用我的函数(检查该标签下的坐标是在椭圆内部还是外部)。该函数会返回一个与数据长度相同的 True/False 数组。
如果某个点在椭圆之外,我想将它的标签更改为 -1。
我尝试了 apply 和 transform,目前最多只能得到如下结果:
label
1 [True, True, False, True, False, False, True, ...
2 [False, False, True, True, False, False, True,...
dtype: object
但是如何将其转换回原始数据帧,并把每个结果为 False 的点的标签设置为 -1?
代码底部被注释掉的部分展示了在不按标签分组的情况下它是如何工作的。
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import patches
import pandas as pd
def _plot_ellipse(xdata, ydata, n_std, ax = None, return_ax = False, **kwargs):
    """
    Fit a covariance ellipse to 2-D points and flag which points it contains.

    The ellipse is centred on the mean of the points, oriented along the
    eigenvectors of their covariance matrix, and scaled to ``n_std`` standard
    deviations along each principal axis.

    Parameters
    ----------
    xdata : array-like
        x coordinates of the points.
    ydata : array-like
        y coordinates of the points.
    n_std : scalar
        Number of sigmas (e.g. 2 for 95% confidence interval)
    ax : matplotlib axes, optional
        Axes to draw the ellipse on. Only used when ``return_ax`` is true;
        the current axes (``plt.gca()``) are used when it is None.
    return_ax : bool
        When true, draw the ellipse and return the axes together with the
        inside/outside flags. When false, nothing is drawn.
    **kwargs
        Passed to matplotlib.patches.Ellipse. Color, alpha, etc..

    Returns
    -------
    in_out
        Inside/outside flags as returned by ``_is_in_ellipse``, when
        ``return_ax`` is false.
    (ax, in_out)
        The axes the ellipse was added to, plus the flags, when
        ``return_ax`` is true.

    Example
    -------
    x = np.random.randn(100)
    y = 0.1 * x + np.random.randn(100)
    fig, ax = plt.subplots()
    ax, in_out = _plot_ellipse(x, y, n_std = 2, ax = ax, alpha = 0.5, return_ax = True)
    ax.scatter(x, y, c = in_out)
    plt.show()
    """
    # One row per point, columns are (x, y).
    coords = np.stack([xdata, ydata], axis=1)
    centroid = coords.mean(axis=0)

    # rowvar=False: the two columns are the variables, the rows are samples.
    covariance = np.cov(coords, rowvar=False)

    # Principal axes of the point cloud, largest variance first.
    eigvals, eigvecs = np.linalg.eigh(covariance)
    descending = eigvals.argsort()[::-1]
    eigvals = eigvals[descending]
    eigvecs = eigvecs[:, descending]

    # Orientation of the major axis, measured from the x axis in degrees.
    major_x, major_y = eigvecs[:, 0]
    angle = np.degrees(np.arctan2(major_y, major_x))

    # Full axis lengths: n_std standard deviations on either side of the centre.
    width, height = 2 * n_std * np.sqrt(eigvals)

    in_out = _is_in_ellipse(
        xdata=xdata,
        ydata=ydata,
        center=centroid,
        width=width,
        height=height,
        angle=angle,
    )

    if not return_ax:
        return in_out

    ellipse = patches.Ellipse(xy=centroid, width=width, height=height, angle=angle, **kwargs)
    target_ax = plt.gca() if ax is None else ax
    target_ax.add_artist(ellipse)
    return target_ax, in_out
def _is_in_ellipse(xdata, ydata, center, width, height, angle):
"""
Determines whether points are in ellipse, given the parameters of the ellipse
Parameters
----------
xdata : array-like
ydata : array-lie
center : array-like, tuple
center of the ellipse as (x,y)
width : scalar
height : scalar
angle : scalar
angle in degrees
Returns
-------
List of True/False, depending on points being inside/outside of the ellipse
"""
cos_angle = np.cos(np.radians(180-angle))
sin_angle = np.sin(np.radians(180-angle))
xc = xdata - center[0]
yc = ydata - center[1]
xct = xc * cos_angle - yc * sin_angle
yct = xc * sin_angle + yc * cos_angle
rad_cc = (xct**2/(width/2)**2) + (yct**2/(height/2)**2)
in_ellipse = []
for r in rad_cc:
in_ellipse.append(True) if r <= 1. else in_ellipse.append(False)
return in_ellipse
# For a single label
# x = np.random.normal(0, 1, 100)
# y = np.random.normal(0, 1, 100)
# labels = [1] * len(x)
#
# df = pd.DataFrame({"x" : x, "y" : y, "label" : labels})
#
# ax, in_out = _plot_ellipse(df.x, df.y, 2, return_ax = True, alpha = 0.5)
# ax.scatter(df.x, df.y, c = in_out)
# plt.show()
# For multiple labels: 100 random points, the first 50 labelled 1 and the
# remaining 50 labelled 2.
x = np.random.normal(0, 1, 100)
y = np.random.normal(0, 1, 100)
labels1 = [1] * 50
labels2 = [2] * 50
labels = labels1 + labels2
df = pd.DataFrame({"x" : x, "y" : y, "label" : labels})
# Run the ellipse test once per label. With return_ax = False, _plot_ellipse
# returns only the inside/outside flags, so `df` is rebound to the per-label
# result of apply rather than to a frame with the original rows.
df = df.groupby("label").apply(lambda group: _plot_ellipse(xdata = group["x"], ydata = group["y"], n_std = 1, return_ax = False))
print(df)
下面是一种可行的方法。如果是我来写,我可能会做更多重构,但您应该能明白这个思路,并可以从这里继续。为简单起见,我已注释掉您的 return_ax 逻辑。
您不需要在 groupby.apply 中使用 lambda,因为您已经定义了函数 _plot_ellipse。您可以向 apply 传递一个 Python 可调用对象以及 kwargs(这些参数会被传递给您的可调用对象)。
这一行代码会变成:
df = df.groupby("label").apply(_plot_ellipse, n_std = 1, return_ax = False)
在您的函数中,pandas 传入的第一个参数就是分组本身,所以您不需要在函数参数中引用 x 和 y 变量。同样,要让 apply 返回一个 DataFrame,您的函数就需要返回一个 DataFrame;在这种情况下,您可以修改该分组,然后将其返回。传入的分组会从 pandas 获得一个名为 name 的属性(即组名),在您的例子中它就是标签。我把函数的开头几行改成了下面这样,这样其余代码可以保持不变:
xdata = grp.x
ydata = grp.y
label = grp.name
然后我修改了您的代码,把标签传给 _is_in_ellipse,由它保留标签或将其更改为 -1。之后我把结果重新赋值给分组的 label 列(grp.label)。
修改后的完整示例如下。
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import patches
import pandas as pd
def _plot_ellipse(grp, n_std, ax = None, return_ax = False, **kwargs):
    """
    Relabel one group's points by whether they lie inside its covariance ellipse.

    Written to be used as the callable of ``DataFrame.groupby(...).apply``.
    The ellipse is centred on the mean of the group's points, oriented along
    the eigenvectors of their covariance matrix, and scaled to ``n_std``
    standard deviations along each principal axis.

    Parameters
    ----------
    grp : pandas.DataFrame
        One group, with ``x`` and ``y`` columns. Its ``name`` attribute is
        read and used as the label for points inside the ellipse.
    n_std : scalar
        Number of sigmas (e.g. 2 for 95% confidence interval)
    ax, return_ax, **kwargs
        Accepted so existing callers keep working, but unused here: the
        plotting branch of the earlier version is disabled in this variant.

    Returns
    -------
    pandas.DataFrame
        A copy of ``grp`` whose ``label`` column holds the values returned
        by ``_is_in_ellipse`` (the group's label for points inside the
        ellipse, -1 for points outside).
    """
    xdata = grp.x
    ydata = grp.y
    label = grp.name

    # One row per point, columns are (x, y).
    points = np.stack([xdata, ydata], axis=1)
    center = points.mean(axis=0)

    # rowvar=False: the two columns are the variables, the rows are samples.
    cov = np.cov(points, rowvar=False)

    # Principal axes of the point cloud, largest variance first.
    vals, vecs = np.linalg.eigh(cov)
    order = vals.argsort()[::-1]
    vals, vecs = vals[order], vecs[:, order]

    # Orientation of the major axis, measured from the x axis in degrees.
    angle = np.degrees(np.arctan2(vecs[1, 0], vecs[0, 0]))

    # Full axis lengths: n_std standard deviations on either side of the centre.
    width, height = 2 * n_std * np.sqrt(vals)

    in_out = _is_in_ellipse(
        label=label,
        xdata=xdata,
        ydata=ydata,
        center=center,
        width=width,
        height=height,
        angle=angle,
    )

    # Return a new frame instead of mutating the one handed in by apply, and
    # address the column by name: attribute-style assignment (grp.label = ...)
    # only sets a plain attribute when the column does not already exist.
    return grp.assign(label=in_out)
def _is_in_ellipse(label, xdata, ydata, center, width, height, angle):
cos_angle = np.cos(np.radians(180-angle))
sin_angle = np.sin(np.radians(180-angle))
xc = xdata - center[0]
yc = ydata - center[1]
xct = xc * cos_angle - yc * sin_angle
yct = xc * sin_angle + yc * cos_angle
rad_cc = (xct**2/(width/2)**2) + (yct**2/(height/2)**2)
# in_ellipse = []
# for r in rad_cc:
# in_ellipse.append(True) if r <= 1. else in_ellipse.append(False)
return pd.Series(rad_cc).apply(lambda r: label if r <= 1. else -1)
# For a single label
# x = np.random.normal(0, 1, 100)
# y = np.random.normal(0, 1, 100)
# labels = [1] * len(x)
#
# df = pd.DataFrame({"x" : x, "y" : y, "label" : labels})
#
# ax, in_out = _plot_ellipse(df.x, df.y, 2, return_ax = True, alpha = 0.5)
# ax.scatter(df.x, df.y, c = in_out)
# plt.show()
# For multiple labels: 100 random points, the first 50 labelled 1 and the
# remaining 50 labelled 2.
x = np.random.normal(0, 1, 100)
y = np.random.normal(0, 1, 100)
labels1 = [1] * 50
labels2 = [2] * 50
labels = labels1 + labels2
df = pd.DataFrame({"x" : x, "y" : y, "label" : labels})
# Relabel each group via _plot_ellipse (which reads the group's x/y columns
# and its name). group_keys = False keeps the original row index on the
# result; without it, recent pandas versions prepend the group key as an
# extra index level, so the rows no longer line up with the input frame.
df = df.groupby("label", group_keys = False).apply(_plot_ellipse, n_std = 1, return_ax = False)
本文收集自互联网,转载请注明来源。
如有侵权,请联系 [email protected] 删除。
我来说两句