我有一個數據集,其中包含大量對象(unit)的時間序列列表,我需要將每個對象的各個列表與該對象的第一個列表進行比較。為此,我一直在使用 fuzzywuzzy 及其相似度(similarity)方法,但我並沒有真正做到將所有後續實例(列表)與每個對象的第一個實例進行比較。為了讓這個問題更容易理解,讓我們看看我到目前為止所做的嘗試。注意:我是 fuzzywuzzy 的新手。
我的數據框是以下形式:
data = {'unit': {59: 'unit1',
662: 'unit1',
680: 'unit1',
725: 'unit1',
709: 'unit1',
703: 'unit1',
653: 'unit1',
807: 'unit4',
825: 'unit4',
778: 'unit4',
816: 'unit4',
822: 'unit4',
849: 'unit4',
820: 'unit4',
754: 'unit4',
1031: 'unit3',
1094: 'unit2',
1008: 'unit2',
1089: 'unit2',
1044: 'unit5'},
'Date_job': {59: datetime.date(2021, 6, 7),
662: datetime.date(2021, 6, 14),
680: datetime.date(2021, 7, 5),
725: datetime.date(2021, 7, 26),
709: datetime.date(2021, 8, 30),
703: datetime.date(2021, 10, 11),
653: datetime.date(2021, 10, 18),
807: datetime.date(2021, 7, 19),
825: datetime.date(2021, 7, 26),
778: datetime.date(2021, 8, 23),
816: datetime.date(2021, 8, 30),
822: datetime.date(2021, 9, 6),
849: datetime.date(2021, 9, 27),
820: datetime.date(2021, 10, 4),
754: datetime.date(2021, 10, 18),
1031: datetime.date(2021, 9, 6),
1094: datetime.date(2021, 7, 26),
1008: datetime.date(2021, 8, 9),
1089: datetime.date(2021, 10, 4),
1044: datetime.date(2021, 6, 14)},
'Vector': {59: ['A|14:1/9.0',
'A|15:1/11.0',
'A|16:1/12.0',
'B|11:1/4.0',
'B|2:1/3.0',
'B|3:1/12.0',
'B|4:1/12.0',
'B|5:1/9.0',
'B|6:1/5.0',
'B|7:1/5.0'],
662: ['A|14:1/9.0',
'A|15:1/11.0',
'A|16:1/12.0',
'B|11:1/4.0',
'B|3:1/12.0',
'B|4:1/12.0',
'B|5:1/9.0',
'B|5:1/8.0',
'B|6:1/5.0',
'B|7:1/5.0'],
680: ['A|14:1/9.0',
'A|14:1/4.0',
'A|15:1/11.0',
'A|16:1/12.0',
'B|11:1/4.0',
'B|3:1/12.0',
'B|4:1/12.0',
'B|5:1/9.0',
'B|6:1/5.0',
'B|7:1/5.0'],
725: ['A|14:1/9.0',
'A|15:1/11.0',
'A|16:1/12.0',
'B|11:1/4.0',
'B|2:1/3.0',
'B|3:1/12.0',
'B|4:1/12.0',
'B|5:1/9.0',
'B|6:1/5.0',
'B|7:1/5.0'],
709: ['A|14:1/9.0',
'A|15:1/11.0',
'A|16:1/12.0',
'B|11:1/4.0',
'B|2:1/3.0',
'B|3:1/12.0',
'B|4:1/12.0',
'B|5:1/9.0',
'B|6:1/5.0',
'B|7:1/5.0'],
703: ['A|14:1/9.0',
'A|15:1/11.0',
'A|16:1/12.0',
'B|11:1/4.0',
'B|2:1/4.0',
'B|3:1/12.0',
'B|4:1/12.0',
'B|5:1/9.0',
'B|6:1/6.0',
'B|7:1/5.0'],
653: ['A|14:1/9.0',
'A|15:1/11.0',
'A|16:1/12.0',
'B|11:1/4.0',
'B|2:1/4.0',
'B|3:1/12.0',
'B|4:1/12.0',
'B|5:1/9.0',
'B|6:1/6.0',
'B|7:1/5.0'],
807: ['A|10:1/13.0',
'A|10:1/13.0',
'A|3:1/6.0',
'A|3:1/6.0',
'A|4:1/2.0',
'A|5:1/2.0',
'A|6:1/5.0',
'A|6:1/5.0',
'A|7:1/10.0',
'A|7:1/10.0'],
825: ['A|10:1/13.0',
'A|10:1/13.0',
'A|3:1/6.0',
'A|3:1/6.0',
'A|5:1/2.0',
'A|5:1/2.0',
'A|6:1/5.0',
'A|6:1/5.0',
'A|7:1/10.0',
'A|7:1/10.0'],
778: ['A|10:1/13.0',
'A|10:1/13.0',
'A|3:1/6.0',
'A|3:1/6.0',
'A|5:1/2.0',
'A|6:1/5.0',
'A|6:1/5.0',
'A|7:1/10.0',
'A|7:1/10.0',
'A|8:1/7.0'],
816: ['A|10:1/13.0',
'A|10:1/13.0',
'A|3:1/6.0',
'A|3:1/6.0',
'A|5:1/2.0',
'A|6:1/5.0',
'A|6:1/4.0',
'A|7:1/10.0',
'A|7:1/10.0',
'A|8:1/7.0'],
822: ['A|10:1/13.0',
'A|10:1/13.0',
'A|3:1/6.0',
'A|3:1/6.0',
'A|5:1/2.0',
'A|5:1/2.0',
'A|6:1/5.0',
'A|6:1/4.0',
'A|7:1/10.0',
'A|7:1/10.0'],
849: ['A|10:1/13.0',
'A|10:1/13.0',
'A|3:1/6.0',
'A|3:1/6.0',
'A|5:1/3.0',
'A|5:1/2.0',
'A|6:1/5.0',
'A|6:1/5.0',
'A|7:1/10.0',
'A|7:1/10.0'],
820: ['A|10:1/13.0',
'A|10:1/13.0',
'A|3:1/6.0',
'A|3:1/6.0',
'A|5:1/5.0',
'A|5:1/2.0',
'A|6:1/5.0',
'A|6:1/5.0',
'A|7:1/10.0',
'A|7:1/10.0'],
754: ['A|10:1/13.0',
'A|10:1/13.0',
'A|3:1/6.0',
'A|3:1/6.0',
'A|5:1/3.0',
'A|5:1/2.0',
'A|6:1/5.0',
'A|6:1/5.0',
'A|7:1/10.0',
'A|7:1/10.0'],
1031: ['A|10:1/7.0',
'A|12:1/2.0',
'A|5:1/10.0',
'A|5:1/2.0',
'A|6:1/12.0',
'A|6:1/11.0',
'A|6:1/4.0',
'A|7:1/9.0',
'A|7:1/6.0',
'A|9:1/2.0'],
1094: ['A|10:1/7.0',
'A|12:1/2.0',
'A|5:1/9.0',
'A|6:1/11.0',
'A|6:1/4.0',
'A|7:1/9.0',
'A|7:1/4.0',
'A|8:1/4.0',
'A|8:1/3.0',
'A|9:1/2.0'],
1008: ['A|10:1/7.0',
'A|12:1/2.0',
'A|5:1/9.0',
'A|5:1/4.0',
'A|6:1/11.0',
'A|6:1/4.0',
'A|7:1/9.0',
'A|7:1/9.0',
'A|8:1/4.0',
'A|9:1/2.0'],
1089: ['A|10:1/7.0',
'A|12:1/2.0',
'A|5:1/9.0',
'A|5:1/2.0',
'A|6:1/11.0',
'A|6:1/6.0',
'A|7:1/9.0',
'A|7:1/3.0',
'A|8:1/4.0',
'A|9:1/2.0'],
1044: ['A|10:1/6.0',
'A|10:1/6.0',
'A|5:1/4.0',
'A|5:1/4.0',
'A|6:1/10.0',
'A|6:1/9.0',
'A|6:1/9.0',
'A|7:1/8.0',
'A|7:1/8.0',
'A|8:1/3.0']}}
由於 fuzzywuzzy 不接受列表作為輸入,我需要先將列表轉換為字符串:
# Build the frame and add a stringified copy of each Vector list in one chained
# expression, because fuzzywuzzy compares strings rather than lists.
df = pd.DataFrame(data).assign(
    Vector_string=lambda frame: frame['Vector'].astype(str)
)
這使:
unit Date_job Vector Vector_string
59 unit1 2021-06-07 [A|14:1/9.0, A|15:1/11.0, A|16:1/12.0, B|11:1/4.0, B|2:1/3.0, B|3:1/12.0, B|4:1/12.0, B|5:1/9.0, B|6:1/5.0, B|7:1/5.0] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/3.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/5.0', 'B|7:1/5.0']
662 unit1 2021-06-14 [A|14:1/9.0, A|15:1/11.0, A|16:1/12.0, B|11:1/4.0, B|3:1/12.0, B|4:1/12.0, B|5:1/9.0, B|5:1/8.0, B|6:1/5.0, B|7:1/5.0] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|5:1/8.0', 'B|6:1/5.0', 'B|7:1/5.0']
680 unit1 2021-07-05 [A|14:1/9.0, A|14:1/4.0, A|15:1/11.0, A|16:1/12.0, B|11:1/4.0, B|3:1/12.0, B|4:1/12.0, B|5:1/9.0, B|6:1/5.0, B|7:1/5.0] ['A|14:1/9.0', 'A|14:1/4.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/5.0', 'B|7:1/5.0']
725 unit1 2021-07-26 [A|14:1/9.0, A|15:1/11.0, A|16:1/12.0, B|11:1/4.0, B|2:1/3.0, B|3:1/12.0, B|4:1/12.0, B|5:1/9.0, B|6:1/5.0, B|7:1/5.0] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/3.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/5.0', 'B|7:1/5.0']
709 unit1 2021-08-30 [A|14:1/9.0, A|15:1/11.0, A|16:1/12.0, B|11:1/4.0, B|2:1/3.0, B|3:1/12.0, B|4:1/12.0, B|5:1/9.0, B|6:1/5.0, B|7:1/5.0] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/3.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/5.0', 'B|7:1/5.0']
703 unit1 2021-10-11 [A|14:1/9.0, A|15:1/11.0, A|16:1/12.0, B|11:1/4.0, B|2:1/4.0, B|3:1/12.0, B|4:1/12.0, B|5:1/9.0, B|6:1/6.0, B|7:1/5.0] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/4.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/6.0', 'B|7:1/5.0']
653 unit1 2021-10-18 [A|14:1/9.0, A|15:1/11.0, A|16:1/12.0, B|11:1/4.0, B|2:1/4.0, B|3:1/12.0, B|4:1/12.0, B|5:1/9.0, B|6:1/6.0, B|7:1/5.0] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/4.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/6.0', 'B|7:1/5.0']
807 unit4 2021-07-19 [A|10:1/13.0, A|10:1/13.0, A|3:1/6.0, A|3:1/6.0, A|4:1/2.0, A|5:1/2.0, A|6:1/5.0, A|6:1/5.0, A|7:1/10.0, A|7:1/10.0] ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A|3:1/6.0', 'A|4:1/2.0', 'A|5:1/2.0', 'A|6:1/5.0', 'A|6:1/5.0', 'A|7:1/10.0', 'A|7:1/10.0']
825 unit4 2021-07-26 [A|10:1/13.0, A|10:1/13.0, A|3:1/6.0, A|3:1/6.0, A|5:1/2.0, A|5:1/2.0, A|6:1/5.0, A|6:1/5.0, A|7:1/10.0, A|7:1/10.0] ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A|3:1/6.0', 'A|5:1/2.0', 'A|5:1/2.0', 'A|6:1/5.0', 'A|6:1/5.0', 'A|7:1/10.0', 'A|7:1/10.0']
778 unit4 2021-08-23 [A|10:1/13.0, A|10:1/13.0, A|3:1/6.0, A|3:1/6.0, A|5:1/2.0, A|6:1/5.0, A|6:1/5.0, A|7:1/10.0, A|7:1/10.0, A|8:1/7.0] ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A|3:1/6.0', 'A|5:1/2.0', 'A|6:1/5.0', 'A|6:1/5.0', 'A|7:1/10.0', 'A|7:1/10.0', 'A|8:1/7.0']
816 unit4 2021-08-30 [A|10:1/13.0, A|10:1/13.0, A|3:1/6.0, A|3:1/6.0, A|5:1/2.0, A|6:1/5.0, A|6:1/4.0, A|7:1/10.0, A|7:1/10.0, A|8:1/7.0] ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A|3:1/6.0', 'A|5:1/2.0', 'A|6:1/5.0', 'A|6:1/4.0', 'A|7:1/10.0', 'A|7:1/10.0', 'A|8:1/7.0']
822 unit4 2021-09-06 [A|10:1/13.0, A|10:1/13.0, A|3:1/6.0, A|3:1/6.0, A|5:1/2.0, A|5:1/2.0, A|6:1/5.0, A|6:1/4.0, A|7:1/10.0, A|7:1/10.0] ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A|3:1/6.0', 'A|5:1/2.0', 'A|5:1/2.0', 'A|6:1/5.0', 'A|6:1/4.0', 'A|7:1/10.0', 'A|7:1/10.0']
849 unit4 2021-09-27 [A|10:1/13.0, A|10:1/13.0, A|3:1/6.0, A|3:1/6.0, A|5:1/3.0, A|5:1/2.0, A|6:1/5.0, A|6:1/5.0, A|7:1/10.0, A|7:1/10.0] ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A|3:1/6.0', 'A|5:1/3.0', 'A|5:1/2.0', 'A|6:1/5.0', 'A|6:1/5.0', 'A|7:1/10.0', 'A|7:1/10.0']
820 unit4 2021-10-04 [A|10:1/13.0, A|10:1/13.0, A|3:1/6.0, A|3:1/6.0, A|5:1/5.0, A|5:1/2.0, A|6:1/5.0, A|6:1/5.0, A|7:1/10.0, A|7:1/10.0] ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A|3:1/6.0', 'A|5:1/5.0', 'A|5:1/2.0', 'A|6:1/5.0', 'A|6:1/5.0', 'A|7:1/10.0', 'A|7:1/10.0']
754 unit4 2021-10-18 [A|10:1/13.0, A|10:1/13.0, A|3:1/6.0, A|3:1/6.0, A|5:1/3.0, A|5:1/2.0, A|6:1/5.0, A|6:1/5.0, A|7:1/10.0, A|7:1/10.0] ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A|3:1/6.0', 'A|5:1/3.0', 'A|5:1/2.0', 'A|6:1/5.0', 'A|6:1/5.0', 'A|7:1/10.0', 'A|7:1/10.0']
1031 unit3 2021-09-06 [A|10:1/7.0, A|12:1/2.0, A|5:1/10.0, A|5:1/2.0, A|6:1/12.0, A|6:1/11.0, A|6:1/4.0, A|7:1/9.0, A|7:1/6.0, A|9:1/2.0] ['A|10:1/7.0', 'A|12:1/2.0', 'A|5:1/10.0', 'A|5:1/2.0', 'A|6:1/12.0', 'A|6:1/11.0', 'A|6:1/4.0', 'A|7:1/9.0', 'A|7:1/6.0', 'A|9:1/2.0']
1094 unit2 2021-07-26 [A|10:1/7.0, A|12:1/2.0, A|5:1/9.0, A|6:1/11.0, A|6:1/4.0, A|7:1/9.0, A|7:1/4.0, A|8:1/4.0, A|8:1/3.0, A|9:1/2.0] ['A|10:1/7.0', 'A|12:1/2.0', 'A|5:1/9.0', 'A|6:1/11.0', 'A|6:1/4.0', 'A|7:1/9.0', 'A|7:1/4.0', 'A|8:1/4.0', 'A|8:1/3.0', 'A|9:1/2.0']
1008 unit2 2021-08-09 [A|10:1/7.0, A|12:1/2.0, A|5:1/9.0, A|5:1/4.0, A|6:1/11.0, A|6:1/4.0, A|7:1/9.0, A|7:1/9.0, A|8:1/4.0, A|9:1/2.0] ['A|10:1/7.0', 'A|12:1/2.0', 'A|5:1/9.0', 'A|5:1/4.0', 'A|6:1/11.0', 'A|6:1/4.0', 'A|7:1/9.0', 'A|7:1/9.0', 'A|8:1/4.0', 'A|9:1/2.0']
1089 unit2 2021-10-04 [A|10:1/7.0, A|12:1/2.0, A|5:1/9.0, A|5:1/2.0, A|6:1/11.0, A|6:1/6.0, A|7:1/9.0, A|7:1/3.0, A|8:1/4.0, A|9:1/2.0] ['A|10:1/7.0', 'A|12:1/2.0', 'A|5:1/9.0', 'A|5:1/2.0', 'A|6:1/11.0', 'A|6:1/6.0', 'A|7:1/9.0', 'A|7:1/3.0', 'A|8:1/4.0', 'A|9:1/2.0']
1044 unit5 2021-06-14 [A|10:1/6.0, A|10:1/6.0, A|5:1/4.0, A|5:1/4.0, A|6:1/10.0, A|6:1/9.0, A|6:1/9.0, A|7:1/8.0, A|7:1/8.0, A|8:1/3.0] ['A|10:1/6.0', 'A|10:1/6.0', 'A|5:1/4.0', 'A|5:1/4.0', 'A|6:1/10.0', 'A|6:1/9.0', 'A|6:1/9.0', 'A|7:1/8.0', 'A|7:1/8.0', 'A|8:1/3.0']
現在,我將每個單元(unit)內的 Vector_string 字符串相互比較,如下所示:
from fuzzywuzzy import process, fuzz
# Distinct unit labels. unique() already de-duplicates, so no set() round-trip
# is needed; the list is reused by the later per-unit loop.
UNITS = list(df['unit'].unique())
match_columns = ['match_Vector', 'match_percent', 'match_index']
fre = []
for unit in UNITS:
    # Rows of this unit, re-indexed 0..n-1 (the original index is kept as a column).
    d = df[df['unit'] == unit].reset_index()
    if len(d) > 1:
        # For every row, take the single best fuzzy match among the OTHER rows
        # of the same unit. Passing a Series makes extract() return
        # (matched_string, score, index_label) triples.
        best_matches = [
            process.extract(vec, d.loc[d.index != i, 'Vector_string'], limit=1)[0]
            for i, vec in enumerate(d['Vector_string'])
        ]
        d2 = pd.DataFrame(best_matches, index=d.index, columns=match_columns)
    else:
        # A unit with a single row has nothing to be compared with. The original
        # `else: 0` was a no-op, so d2 kept the previous unit's matches (or was
        # undefined on the first iteration). Use empty match columns instead.
        d2 = pd.DataFrame(index=d.index, columns=match_columns)
    final = d.join(d2)
    fre.append(final)
dff = pd.concat(fre)
dff = dff.sort_values(['unit', 'Date_job'])
返回:
index unit Date_job Vector Vector_string match_Vector match_percent match_index
0 59 unit1 2021-06-07 [A|14:1/9.0, A|15:1/11.0, A|16:1/12.0, B|11:1/4.0, B|2:1/3.0, B|3:1/12.0, B|4:1/12.0, B|5:1/9.0, B|6:1/5.0, B|7:1/5.0] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/3.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/5.0', 'B|7:1/5.0'] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/3.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/5.0', 'B|7:1/5.0'] 100 3
1 662 unit1 2021-06-14 [A|14:1/9.0, A|15:1/11.0, A|16:1/12.0, B|11:1/4.0, B|3:1/12.0, B|4:1/12.0, B|5:1/9.0, B|5:1/8.0, B|6:1/5.0, B|7:1/5.0] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|5:1/8.0', 'B|6:1/5.0', 'B|7:1/5.0'] ['A|14:1/9.0', 'A|14:1/4.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/5.0', 'B|7:1/5.0'] 95 2
2 680 unit1 2021-07-05 [A|14:1/9.0, A|14:1/4.0, A|15:1/11.0, A|16:1/12.0, B|11:1/4.0, B|3:1/12.0, B|4:1/12.0, B|5:1/9.0, B|6:1/5.0, B|7:1/5.0] ['A|14:1/9.0', 'A|14:1/4.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/5.0', 'B|7:1/5.0'] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/3.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/5.0', 'B|7:1/5.0'] 95 0
3 725 unit1 2021-07-26 [A|14:1/9.0, A|15:1/11.0, A|16:1/12.0, B|11:1/4.0, B|2:1/3.0, B|3:1/12.0, B|4:1/12.0, B|5:1/9.0, B|6:1/5.0, B|7:1/5.0] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/3.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/5.0', 'B|7:1/5.0'] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/3.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/5.0', 'B|7:1/5.0'] 100 0
4 709 unit1 2021-08-30 [A|14:1/9.0, A|15:1/11.0, A|16:1/12.0, B|11:1/4.0, B|2:1/3.0, B|3:1/12.0, B|4:1/12.0, B|5:1/9.0, B|6:1/5.0, B|7:1/5.0] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/3.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/5.0', 'B|7:1/5.0'] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/3.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/5.0', 'B|7:1/5.0'] 100 0
5 703 unit1 2021-10-11 [A|14:1/9.0, A|15:1/11.0, A|16:1/12.0, B|11:1/4.0, B|2:1/4.0, B|3:1/12.0, B|4:1/12.0, B|5:1/9.0, B|6:1/6.0, B|7:1/5.0] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/4.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/6.0', 'B|7:1/5.0'] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/4.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/6.0', 'B|7:1/5.0'] 100 6
6 653 unit1 2021-10-18 [A|14:1/9.0, A|15:1/11.0, A|16:1/12.0, B|11:1/4.0, B|2:1/4.0, B|3:1/12.0, B|4:1/12.0, B|5:1/9.0, B|6:1/6.0, B|7:1/5.0] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/4.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/6.0', 'B|7:1/5.0'] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/4.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/6.0', 'B|7:1/5.0'] 100 5
0 1094 unit2 2021-07-26 [A|10:1/7.0, A|12:1/2.0, A|5:1/9.0, A|6:1/11.0, A|6:1/4.0, A|7:1/9.0, A|7:1/4.0, A|8:1/4.0, A|8:1/3.0, A|9:1/2.0] ['A|10:1/7.0', 'A|12:1/2.0', 'A|5:1/9.0', 'A|6:1/11.0', 'A|6:1/4.0', 'A|7:1/9.0', 'A|7:1/4.0', 'A|8:1/4.0', 'A|8:1/3.0', 'A|9:1/2.0'] ['A|10:1/7.0', 'A|12:1/2.0', 'A|5:1/9.0', 'A|5:1/4.0', 'A|6:1/11.0', 'A|6:1/4.0', 'A|7:1/9.0', 'A|7:1/9.0', 'A|8:1/4.0', 'A|9:1/2.0'] 95 1
1 1008 unit2 2021-08-09 [A|10:1/7.0, A|12:1/2.0, A|5:1/9.0, A|5:1/4.0, A|6:1/11.0, A|6:1/4.0, A|7:1/9.0, A|7:1/9.0, A|8:1/4.0, A|9:1/2.0] ['A|10:1/7.0', 'A|12:1/2.0', 'A|5:1/9.0', 'A|5:1/4.0', 'A|6:1/11.0', 'A|6:1/4.0', 'A|7:1/9.0', 'A|7:1/9.0', 'A|8:1/4.0', 'A|9:1/2.0'] ['A|10:1/7.0', 'A|12:1/2.0', 'A|5:1/9.0', 'A|5:1/2.0', 'A|6:1/11.0', 'A|6:1/6.0', 'A|7:1/9.0', 'A|7:1/3.0', 'A|8:1/4.0', 'A|9:1/2.0'] 98 2
2 1089 unit2 2021-10-04 [A|10:1/7.0, A|12:1/2.0, A|5:1/9.0, A|5:1/2.0, A|6:1/11.0, A|6:1/6.0, A|7:1/9.0, A|7:1/3.0, A|8:1/4.0, A|9:1/2.0] ['A|10:1/7.0', 'A|12:1/2.0', 'A|5:1/9.0', 'A|5:1/2.0', 'A|6:1/11.0', 'A|6:1/6.0', 'A|7:1/9.0', 'A|7:1/3.0', 'A|8:1/4.0', 'A|9:1/2.0'] ['A|10:1/7.0', 'A|12:1/2.0', 'A|5:1/9.0', 'A|5:1/4.0', 'A|6:1/11.0', 'A|6:1/4.0', 'A|7:1/9.0', 'A|7:1/9.0', 'A|8:1/4.0', 'A|9:1/2.0'] 98 1
0 1031 unit3 2021-09-06 [A|10:1/7.0, A|12:1/2.0, A|5:1/10.0, A|5:1/2.0, A|6:1/12.0, A|6:1/11.0, A|6:1/4.0, A|7:1/9.0, A|7:1/6.0, A|9:1/2.0] ['A|10:1/7.0', 'A|12:1/2.0', 'A|5:1/10.0', 'A|5:1/2.0', 'A|6:1/12.0', 'A|6:1/11.0', 'A|6:1/4.0', 'A|7:1/9.0', 'A|7:1/6.0', 'A|9:1/2.0'] ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A|3:1/6.0', 'A|5:1/2.0', 'A|5:1/2.0', 'A|6:1/5.0', 'A|6:1/5.0', 'A|7:1/10.0', 'A|7:1/10.0'] 99 1
0 807 unit4 2021-07-19 [A|10:1/13.0, A|10:1/13.0, A|3:1/6.0, A|3:1/6.0, A|4:1/2.0, A|5:1/2.0, A|6:1/5.0, A|6:1/5.0, A|7:1/10.0, A|7:1/10.0] ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A|3:1/6.0', 'A|4:1/2.0', 'A|5:1/2.0', 'A|6:1/5.0', 'A|6:1/5.0', 'A|7:1/10.0', 'A|7:1/10.0'] ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A|3:1/6.0', 'A|5:1/2.0', 'A|5:1/2.0', 'A|6:1/5.0', 'A|6:1/5.0', 'A|7:1/10.0', 'A|7:1/10.0'] 99 1
1 825 unit4 2021-07-26 [A|10:1/13.0, A|10:1/13.0, A|3:1/6.0, A|3:1/6.0, A|5:1/2.0, A|5:1/2.0, A|6:1/5.0, A|6:1/5.0, A|7:1/10.0, A|7:1/10.0] ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A|3:1/6.0', 'A|5:1/2.0', 'A|5:1/2.0', 'A|6:1/5.0', 'A|6:1/5.0', 'A|7:1/10.0', 'A|7:1/10.0'] ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A|3:1/6.0', 'A|4:1/2.0', 'A|5:1/2.0', 'A|6:1/5.0', 'A|6:1/5.0', 'A|7:1/10.0', 'A|7:1/10.0'] 99 0
2 778 unit4 2021-08-23 [A|10:1/13.0, A|10:1/13.0, A|3:1/6.0, A|3:1/6.0, A|5:1/2.0, A|6:1/5.0, A|6:1/5.0, A|7:1/10.0, A|7:1/10.0, A|8:1/7.0] ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A|3:1/6.0', 'A|5:1/2.0', 'A|6:1/5.0', 'A|6:1/5.0', 'A|7:1/10.0', 'A|7:1/10.0', 'A|8:1/7.0'] ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A|3:1/6.0', 'A|5:1/2.0', 'A|6:1/5.0', 'A|6:1/4.0', 'A|7:1/10.0', 'A|7:1/10.0', 'A|8:1/7.0'] 99 3
3 816 unit4 2021-08-30 [A|10:1/13.0, A|10:1/13.0, A|3:1/6.0, A|3:1/6.0, A|5:1/2.0, A|6:1/5.0, A|6:1/4.0, A|7:1/10.0, A|7:1/10.0, A|8:1/7.0] ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A|3:1/6.0', 'A|5:1/2.0', 'A|6:1/5.0', 'A|6:1/4.0', 'A|7:1/10.0', 'A|7:1/10.0', 'A|8:1/7.0'] ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A|3:1/6.0', 'A|5:1/2.0', 'A|6:1/5.0', 'A|6:1/5.0', 'A|7:1/10.0', 'A|7:1/10.0', 'A|8:1/7.0'] 99 2
4 822 unit4 2021-09-06 [A|10:1/13.0, A|10:1/13.0, A|3:1/6.0, A|3:1/6.0, A|5:1/2.0, A|5:1/2.0, A|6:1/5.0, A|6:1/4.0, A|7:1/10.0, A|7:1/10.0] ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A|3:1/6.0', 'A|5:1/2.0', 'A|5:1/2.0', 'A|6:1/5.0', 'A|6:1/4.0', 'A|7:1/10.0', 'A|7:1/10.0'] ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A|3:1/6.0', 'A|5:1/2.0', 'A|5:1/2.0', 'A|6:1/5.0', 'A|6:1/5.0', 'A|7:1/10.0', 'A|7:1/10.0'] 99 1
5 849 unit4 2021-09-27 [A|10:1/13.0, A|10:1/13.0, A|3:1/6.0, A|3:1/6.0, A|5:1/3.0, A|5:1/2.0, A|6:1/5.0, A|6:1/5.0, A|7:1/10.0, A|7:1/10.0] ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A|3:1/6.0', 'A|5:1/3.0', 'A|5:1/2.0', 'A|6:1/5.0', 'A|6:1/5.0', 'A|7:1/10.0', 'A|7:1/10.0'] ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A|3:1/6.0', 'A|5:1/3.0', 'A|5:1/2.0', 'A|6:1/5.0', 'A|6:1/5.0', 'A|7:1/10.0', 'A|7:1/10.0'] 100 7
6 820 unit4 2021-10-04 [A|10:1/13.0, A|10:1/13.0, A|3:1/6.0, A|3:1/6.0, A|5:1/5.0, A|5:1/2.0, A|6:1/5.0, A|6:1/5.0, A|7:1/10.0, A|7:1/10.0] ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A|3:1/6.0', 'A|5:1/5.0', 'A|5:1/2.0', 'A|6:1/5.0', 'A|6:1/5.0', 'A|7:1/10.0', 'A|7:1/10.0'] ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A|3:1/6.0', 'A|5:1/2.0', 'A|5:1/2.0', 'A|6:1/5.0', 'A|6:1/5.0', 'A|7:1/10.0', 'A|7:1/10.0'] 99 1
7 754 unit4 2021-10-18 [A|10:1/13.0, A|10:1/13.0, A|3:1/6.0, A|3:1/6.0, A|5:1/3.0, A|5:1/2.0, A|6:1/5.0, A|6:1/5.0, A|7:1/10.0, A|7:1/10.0] ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A|3:1/6.0', 'A|5:1/3.0', 'A|5:1/2.0', 'A|6:1/5.0', 'A|6:1/5.0', 'A|7:1/10.0', 'A|7:1/10.0'] ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A|3:1/6.0', 'A|5:1/3.0', 'A|5:1/2.0', 'A|6:1/5.0', 'A|6:1/5.0', 'A|7:1/10.0', 'A|7:1/10.0'] 100 5
0 1044 unit5 2021-06-14 [A|10:1/6.0, A|10:1/6.0, A|5:1/4.0, A|5:1/4.0, A|6:1/10.0, A|6:1/9.0, A|6:1/9.0, A|7:1/8.0, A|7:1/8.0, A|8:1/3.0] ['A|10:1/6.0', 'A|10:1/6.0', 'A|5:1/4.0', 'A|5:1/4.0', 'A|6:1/10.0', 'A|6:1/9.0', 'A|6:1/9.0', 'A|7:1/8.0', 'A|7:1/8.0', 'A|8:1/3.0'] ['A|10:1/7.0', 'A|12:1/2.0', 'A|5:1/9.0', 'A|5:1/4.0', 'A|6:1/11.0', 'A|6:1/4.0', 'A|7:1/9.0', 'A|7:1/9.0', 'A|8:1/4.0', 'A|9:1/2.0'] 95 1
請注意,我已經創建了:
a) 一列,給出與另一個字符串的匹配百分比;
b) 一列,給出所匹配字符串所在行的索引。
但這並不是我真正想要的。實際上,我希望每個組的第一行與自身 100% 匹配(match_index = 0),並且該組中其餘的每個字符串都與第一個字符串進行比較。
我可以接受的另一種方法如下:
fred = []
for unit in UNITS:
    # Rows of the current unit with a fresh 0..n-1 index.
    d = df[df['unit'] == unit].reset_index()
    # Collect (query, matched_string, score, index_label) for every row's
    # matches within the unit, scored with token_sort_ratio.
    score_sort = []
    for x in d['Vector_string']:
        matches = process.extract(x, d['Vector_string'], scorer=fuzz.token_sort_ratio)
        score_sort.extend((x,) + i for i in matches)
    similarity_sort = pd.DataFrame(
        score_sort,
        columns=['Vector_string_r', 'Matched_vector', 'match_sort', 'score_sort'],
    )
    # join() aligns on the positional index, so only the first len(d) match rows
    # are attached to d.
    final = d.join(similarity_sort)
    fred.append(final)
dfff = pd.concat(fred)
這使:
print(dfff.sort_values(['unit','Date_job']).head(10))
index unit Date_job Vector Vector_string Vector_string_r Matched_vector match_sort score_sort
0 59 unit1 2021-06-07 [A|14:1/9.0, A|15:1/11.0, A|16:1/12.0, B|11:1/4.0, B|2:1/3.0, B|3:1/12.0, B|4:1/12.0, B|5:1/9.0, B|6:1/5.0, B|7:1/5.0] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/3.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/5.0', 'B|7:1/5.0'] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/3.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/5.0', 'B|7:1/5.0'] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/3.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/5.0', 'B|7:1/5.0'] 100 0
1 662 unit1 2021-06-14 [A|14:1/9.0, A|15:1/11.0, A|16:1/12.0, B|11:1/4.0, B|3:1/12.0, B|4:1/12.0, B|5:1/9.0, B|5:1/8.0, B|6:1/5.0, B|7:1/5.0] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|5:1/8.0', 'B|6:1/5.0', 'B|7:1/5.0'] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/3.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/5.0', 'B|7:1/5.0'] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/3.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/5.0', 'B|7:1/5.0'] 100 3
2 680 unit1 2021-07-05 [A|14:1/9.0, A|14:1/4.0, A|15:1/11.0, A|16:1/12.0, B|11:1/4.0, B|3:1/12.0, B|4:1/12.0, B|5:1/9.0, B|6:1/5.0, B|7:1/5.0] ['A|14:1/9.0', 'A|14:1/4.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/5.0', 'B|7:1/5.0'] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/3.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/5.0', 'B|7:1/5.0'] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/3.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/5.0', 'B|7:1/5.0'] 100 4
3 725 unit1 2021-07-26 [A|14:1/9.0, A|15:1/11.0, A|16:1/12.0, B|11:1/4.0, B|2:1/3.0, B|3:1/12.0, B|4:1/12.0, B|5:1/9.0, B|6:1/5.0, B|7:1/5.0] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/3.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/5.0', 'B|7:1/5.0'] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/3.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/5.0', 'B|7:1/5.0'] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/4.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/6.0', 'B|7:1/5.0'] 98 5
4 709 unit1 2021-08-30 [A|14:1/9.0, A|15:1/11.0, A|16:1/12.0, B|11:1/4.0, B|2:1/3.0, B|3:1/12.0, B|4:1/12.0, B|5:1/9.0, B|6:1/5.0, B|7:1/5.0] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/3.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/5.0', 'B|7:1/5.0'] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/3.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/5.0', 'B|7:1/5.0'] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/4.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/6.0', 'B|7:1/5.0'] 98 6
5 703 unit1 2021-10-11 [A|14:1/9.0, A|15:1/11.0, A|16:1/12.0, B|11:1/4.0, B|2:1/4.0, B|3:1/12.0, B|4:1/12.0, B|5:1/9.0, B|6:1/6.0, B|7:1/5.0] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/4.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/6.0', 'B|7:1/5.0'] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|5:1/8.0', 'B|6:1/5.0', 'B|7:1/5.0'] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|5:1/8.0', 'B|6:1/5.0', 'B|7:1/5.0'] 100 1
6 653 unit1 2021-10-18 [A|14:1/9.0, A|15:1/11.0, A|16:1/12.0, B|11:1/4.0, B|2:1/4.0, B|3:1/12.0, B|4:1/12.0, B|5:1/9.0, B|6:1/6.0, B|7:1/5.0] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/4.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/6.0', 'B|7:1/5.0'] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|5:1/8.0', 'B|6:1/5.0', 'B|7:1/5.0'] ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', 'B|11:1/4.0', 'B|2:1/3.0', 'B|3:1/12.0', 'B|4:1/12.0', 'B|5:1/9.0', 'B|6:1/5.0', 'B|7:1/5.0'] 96 0
0 1094 unit2 2021-07-26 [A|10:1/7.0, A|12:1/2.0, A|5:1/9.0, A|6:1/11.0, A|6:1/4.0, A|7:1/9.0, A|7:1/4.0, A|8:1/4.0, A|8:1/3.0, A|9:1/2.0] ['A|10:1/7.0', 'A|12:1/2.0', 'A|5:1/9.0', 'A|6:1/11.0', 'A|6:1/4.0', 'A|7:1/9.0', 'A|7:1/4.0', 'A|8:1/4.0', 'A|8:1/3.0', 'A|9:1/2.0'] ['A|10:1/7.0', 'A|12:1/2.0', 'A|5:1/9.0', 'A|6:1/11.0', 'A|6:1/4.0', 'A|7:1/9.0', 'A|7:1/4.0', 'A|8:1/4.0', 'A|8:1/3.0', 'A|9:1/2.0'] ['A|10:1/7.0', 'A|12:1/2.0', 'A|5:1/9.0', 'A|6:1/11.0', 'A|6:1/4.0', 'A|7:1/9.0', 'A|7:1/4.0', 'A|8:1/4.0', 'A|8:1/3.0', 'A|9:1/2.0'] 100 0
1 1008 unit2 2021-08-09 [A|10:1/7.0, A|12:1/2.0, A|5:1/9.0, A|5:1/4.0, A|6:1/11.0, A|6:1/4.0, A|7:1/9.0, A|7:1/9.0, A|8:1/4.0, A|9:1/2.0] ['A|10:1/7.0', 'A|12:1/2.0', 'A|5:1/9.0', 'A|5:1/4.0', 'A|6:1/11.0', 'A|6:1/4.0', 'A|7:1/9.0', 'A|7:1/9.0', 'A|8:1/4.0', 'A|9:1/2.0'] ['A|10:1/7.0', 'A|12:1/2.0', 'A|5:1/9.0', 'A|6:1/11.0', 'A|6:1/4.0', 'A|7:1/9.0', 'A|7:1/4.0', 'A|8:1/4.0', 'A|8:1/3.0', 'A|9:1/2.0'] ['A|10:1/7.0', 'A|12:1/2.0', 'A|5:1/9.0', 'A|5:1/4.0', 'A|6:1/11.0', 'A|6:1/4.0', 'A|7:1/9.0', 'A|7:1/9.0', 'A|8:1/4.0', 'A|9:1/2.0'] 97 1
2 1089 unit2 2021-10-04 [A|10:1/7.0, A|12:1/2.0, A|5:1/9.0, A|5:1/2.0, A|6:1/11.0, A|6:1/6.0, A|7:1/9.0, A|7:1/3.0, A|8:1/4.0, A|9:1/2.0] ['A|10:1/7.0', 'A|12:1/2.0', 'A|5:1/9.0', 'A|5:1/2.0', 'A|6:1/11.0', 'A|6:1/6.0', 'A|7:1/9.0', 'A|7:1/3.0', 'A|8:1/4.0', 'A|9:1/2.0'] ['A|10:1/7.0', 'A|12:1/2.0', 'A|5:1/9.0', 'A|6:1/11.0', 'A|6:1/4.0', 'A|7:1/9.0', 'A|7:1/4.0', 'A|8:1/4.0', 'A|8:1/3.0', 'A|9:1/2.0'] ['A|10:1/7.0', 'A|12:1/2.0', 'A|5:1/9.0', 'A|5:1/2.0', 'A|6:1/11.0', 'A|6:1/6.0', 'A|7:1/9.0', 'A|7:1/3.0', 'A|8:1/4.0', 'A|9:1/2.0'] 95 2
此方法確實解決了“將第一行與其自身進行比較”的問題,但它不會將每個後續行與第一行進行比較(當然是針對每個單元!)。
任何見解都非常感謝。
如果我理解正確,您希望針對每個 unit,計算每個元素與該組第一個元素的相似度。一種做法:
1. 按 Date_job 排序,使「第一行」有明確的定義(此處未顯示);
2. 新增一列 first_vec,在每個 unit 組內重複 Vector_string 的第一個值;
3. 對每一行計算 fuzz.ratio(Vector_string, first_vec);
4. 最後可選擇刪除輔助列 first_vec。
# Helper column: the first Vector_string of each unit, repeated on every row of
# that unit. "First" follows the current row order, so sort by Date_job
# beforehand if the frame is not already ordered.
df["first_vec"] = df.groupby("unit")["Vector_string"].transform("first")
# Similarity score of every row against its unit's first vector.
df["score"] = df.apply(lambda x: fuzz.ratio(x.Vector_string, x.first_vec), axis=1)
# drop() targets row labels by default, so the column must be named explicitly;
# the original call raised KeyError.
df.drop(columns="first_vec", inplace=True)
輸出:
unit Date_job Vector Vector_string score
59 unit1 2021-06-07 [A|14:1/9.0, A|15:1/11.0, A|16:1/12.0, B|11:1/... ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', '... 100
662 unit1 2021-06-14 [A|14:1/9.0, A|15:1/11.0, A|16:1/12.0, B|11:1/... ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', '... 91
680 unit1 2021-07-05 [A|14:1/9.0, A|14:1/4.0, A|15:1/11.0, A|16:1/1... ['A|14:1/9.0', 'A|14:1/4.0', 'A|15:1/11.0', 'A... 90
725 unit1 2021-07-26 [A|14:1/9.0, A|15:1/11.0, A|16:1/12.0, B|11:1/... ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', '... 100
709 unit1 2021-08-30 [A|14:1/9.0, A|15:1/11.0, A|16:1/12.0, B|11:1/... ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', '... 100
703 unit1 2021-10-11 [A|14:1/9.0, A|15:1/11.0, A|16:1/12.0, B|11:1/... ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', '... 99
653 unit1 2021-10-18 [A|14:1/9.0, A|15:1/11.0, A|16:1/12.0, B|11:1/... ['A|14:1/9.0', 'A|15:1/11.0', 'A|16:1/12.0', '... 99
807 unit4 2021-07-19 [A|10:1/13.0, A|10:1/13.0, A|3:1/6.0, A|3:1/6.... ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A... 100
825 unit4 2021-07-26 [A|10:1/13.0, A|10:1/13.0, A|3:1/6.0, A|3:1/6.... ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A... 99
778 unit4 2021-08-23 [A|10:1/13.0, A|10:1/13.0, A|3:1/6.0, A|3:1/6.... ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A... 90
816 unit4 2021-08-30 [A|10:1/13.0, A|10:1/13.0, A|3:1/6.0, A|3:1/6.... ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A... 90
822 unit4 2021-09-06 [A|10:1/13.0, A|10:1/13.0, A|3:1/6.0, A|3:1/6.... ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A... 99
849 unit4 2021-09-27 [A|10:1/13.0, A|10:1/13.0, A|3:1/6.0, A|3:1/6.... ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A... 99
820 unit4 2021-10-04 [A|10:1/13.0, A|10:1/13.0, A|3:1/6.0, A|3:1/6.... ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A... 99
754 unit4 2021-10-18 [A|10:1/13.0, A|10:1/13.0, A|3:1/6.0, A|3:1/6.... ['A|10:1/13.0', 'A|10:1/13.0', 'A|3:1/6.0', 'A... 99
1031 unit3 2021-09-06 [A|10:1/7.0, A|12:1/2.0, A|5:1/10.0, A|5:1/2.0... ['A|10:1/7.0', 'A|12:1/2.0', 'A|5:1/10.0', 'A|... 100
1094 unit2 2021-07-26 [A|10:1/7.0, A|12:1/2.0, A|5:1/9.0, A|6:1/11.0... ['A|10:1/7.0', 'A|12:1/2.0', 'A|5:1/9.0', 'A|6... 100
1008 unit2 2021-08-09 [A|10:1/7.0, A|12:1/2.0, A|5:1/9.0, A|5:1/4.0,... ['A|10:1/7.0', 'A|12:1/2.0', 'A|5:1/9.0', 'A|5... 89
1089 unit2 2021-10-04 [A|10:1/7.0, A|12:1/2.0, A|5:1/9.0, A|5:1/2.0,... ['A|10:1/7.0', 'A|12:1/2.0', 'A|5:1/9.0', 'A|5... 89
1044 unit5 2021-06-14 [A|10:1/6.0, A|10:1/6.0, A|5:1/4.0, A|5:1/4.0,... ['A|10:1/6.0', 'A|10:1/6.0', 'A|5:1/4.0', 'A|5... 100
您也可以用與上述相同的方法,把每組第一行的索引值一併加入結果中。
本文收集自互联网,转载请注明来源。
如有侵权,请联系 [email protected] 删除。
我来说两句