我有以下df(dput()
在问题末尾):
link person type ID1 bin
1311752 793482 514677 departure 347 36900
1020897 66154 514677 arrival 379 36900
1376554 858880 514679 departure 374 42300
1374956 858880 514679 arrival 374 46800
1567933 992830 514680 arrival 140 31500
1568804 992830 514680 departure 140 32400
726546 87723 514681 departure 42 25200
726547 481929 514681 arrival 233 27900
723875 481929 514681 departure 233 45000
1396595 87723 514681 arrival 42 45900
我想要创建一个新的df,在其中我看到哪个person
从(type=="departure"
)哪个区域(ID1
)到(type=="arrival"
)哪个区域(ID1
)以及在哪个出发点bin
。
从上面示例的最后4行可以看出,同一行可能person
有多个arrival
和departure
。因此,重要的是将彼此最接近的两个组合(因此最小的delta bin
)。
因此,新的df应该如下所示:
person departure arrival bin
514677 347 379 36900
514679 374 374 42300
514681 42 233 25200
514681 233 42 45000
...
如您所见,此人514681
在42到233之间往返。
我想您可以说我想基于来汇总数据person
。我不太确定如何做到这一点,并且会很高兴获得帮助!
structure(list(link = c("793482", "66154", "858880", "858880",
"992830", "992830", "858880", "858880", "514143", "87723", "481929",
"481929", "87723", "87723", "964586", "87723", "1013453", "pt_8591034",
"973606", "pt_8591034", "481929", "481929", "87723", "87723",
"399738", "399738", "87723", "738635", "481929", "738635", "481929",
"738635", "301867", "89053", "92917", "92917", "981899", "981899",
"89053", "92917", "89053", "89053", "699715", "699715", "89053",
"452176", "738635", "452176", "738635", "641738", "712438", "699715",
"215822", "621953", "634264", "362138", "634264", "634264", "763815",
"763815", "631191", "631191", "763815", "763815", "634264", "634264",
"10826", "10826", "634264", "999977", "1046826", "1046826", "634264",
"999977", "848815", "848815", "999977", "999977", "848815", "848815",
"999977", "261239", "993653", "287772", "763815", "763815", "287772",
"494898", "824071", "594222", "494898", "494898", "452642", "452642",
"494898", "494898", "452642", "494898", "787840", "442285"),
person = c(514677L, 514677L, 514679L, 514679L, 514680L, 514680L,
514680L, 514680L, 514680L, 514681L, 514681L, 514681L, 514681L,
514682L, 514682L, 514683L, 514683L, 514684L, 514684L, 514684L,
514684L, 514684L, 514684L, 514685L, 514685L, 514685L, 514685L,
514688L, 514688L, 514688L, 514688L, 514690L, 514690L, 514707L,
514707L, 514707L, 514707L, 514707L, 514707L, 514708L, 514708L,
514709L, 514709L, 514709L, 514709L, 514710L, 514710L, 514710L,
514710L, 514711L, 514711L, 514733L, 514733L, 514757L, 514757L,
514757L, 514757L, 514759L, 514759L, 514759L, 514759L, 514759L,
514759L, 514759L, 514759L, 514760L, 514760L, 514760L, 514760L,
514774L, 514774L, 514774L, 514774L, 514774L, 514774L, 514774L,
514774L, 514774L, 514774L, 514774L, 514774L, 514782L, 514782L,
514786L, 514786L, 514786L, 514786L, 514802L, 514802L, 514802L,
514802L, 514803L, 514803L, 514803L, 514803L, 514803L, 514803L,
514804L, 514804L, 514805L), type = c("departure", "arrival",
"departure", "arrival", "arrival", "departure", "arrival",
"departure", "arrival", "departure", "arrival", "departure",
"arrival", "departure", "arrival", "departure", "arrival",
"arrival", "departure", "departure", "arrival", "departure",
"arrival", "departure", "arrival", "departure", "arrival",
"departure", "arrival", "departure", "arrival", "departure",
"arrival", "departure", "arrival", "departure", "arrival",
"departure", "arrival", "departure", "arrival", "departure",
"arrival", "departure", "arrival", "departure", "arrival",
"arrival", "departure", "departure", "arrival", "departure",
"arrival", "departure", "arrival", "departure", "arrival",
"departure", "arrival", "departure", "arrival", "departure",
"arrival", "departure", "arrival", "departure", "arrival",
"departure", "arrival", "departure", "arrival", "departure",
"arrival", "departure", "arrival", "departure", "arrival",
"departure", "arrival", "departure", "arrival", "departure",
"arrival", "departure", "arrival", "departure", "arrival",
"departure", "arrival", "departure", "arrival", "departure",
"arrival", "departure", "arrival", "departure", "arrival",
"departure", "arrival", "departure"), ID1 = c(347L, 379L,
374L, 374L, 140L, 140L, 374L, 374L, 401L, 42L, 233L, 233L,
42L, 42L, 496L, 42L, 235L, 172L, 231L, 172L, 233L, 233L,
42L, 42L, 267L, 267L, 42L, 58L, 233L, 58L, 233L, 58L, 292L,
96L, 264L, 264L, 26L, 26L, 96L, 264L, 96L, 96L, 266L, 266L,
96L, 57L, 58L, 57L, 58L, 123L, 55L, 266L, 77L, 412L, 407L,
410L, 407L, 407L, 297L, 297L, 169L, 169L, 297L, 297L, 407L,
407L, 566L, 566L, 407L, 403L, 380L, 380L, 407L, 403L, 436L,
436L, 403L, 403L, 436L, 436L, 403L, 545L, 324L, 448L, 297L,
297L, 448L, 209L, 152L, 205L, 209L, 209L, 204L, 204L, 209L,
209L, 204L, 209L, 383L, 181L), bin = c(36900, 36900, 42300,
46800, 31500, 32400, 34200, 61200, 72000, 25200, 27900, 45000,
45900, 27900, 31500, 30600, 33300, 56700, 56700, 60300, 61200,
63900, 65700, 28800, 32400, 75600, 77400, 24300, 26100, 45900,
47700, 51300, 53100, 27900, 31500, 36900, 38700, 41400, 42300,
62100, 63900, 15300, 16200, 45900, 47700, 35100, 35100, 67500,
67500, 63900, 65700, 45900, 46800, 63000, 64800, 90000, 90900,
27000, 28800, 45000, 46800, 63000, 67500, 68400, 72000, 48600,
50400, 53100, 55800, 37800, 38700, 40500, 41400, 47700, 48600,
51300, 52200, 62100, 63000, 63900, 64800, 22500, 24300, 19800,
20700, 54000, 57600, 21600, 22500, 55800, 56700, 36900, 38700,
51300, 52200, 62100, 63900, 30600, 34200, 33300)), row.names = c(1311752L,
1020897L, 1376554L, 1374956L, 1567933L, 1568804L, 1378922L, 1378604L,
796332L, 1396610L, 726547L, 723875L, 1396595L, 1398179L, 1513826L,
1396821L, 27568L, 1607272L, 1542798L, 1606758L, 721147L, 720198L,
1397549L, 1398033L, 621908L, 626763L, 1397225L, 1184210L, 729418L,
1185149L, 728038L, 1183207L, 465692L, 1426554L, 1460666L, 1457289L,
1551709L, 1550925L, 1429217L, 1461183L, 1427235L, 1428930L, 1117203L,
1119814L, 1431191L, 683362L, 1183236L, 684338L, 1184602L, 990483L,
1142535L, 1125893L, 288323L, 947577L, 966966L, 544664L, 969372L,
964351L, 1230180L, 1231933L, 960341L, 960438L, 1238195L, 1241438L,
965853L, 964667L, 105034L, 106179L, 966724L, 1586724L, 72120L,
76232L, 969048L, 1587092L, 1360314L, 1361152L, 1585499L, 1586919L,
1361549L, 1359631L, 1587109L, 378600L, 1573640L, 428095L, 1236840L,
1235995L, 430374L, 757402L, 1331227L, 916001L, 760182L, 757620L,
686066L, 685765L, 759010L, 760140L, 687455L, 758617L, 1287055L,
676520L), class = "data.frame")
使用dplyr
,可能是这样的:
library(dplyr)
data %>%
group_by(person) %>%
summarize(departure = ID1[type == "departure"][1],
arrival = ID1[type == "arrival"][1],
bin = bin[type == "departure"][1])
这里是一个试图满足附加逻辑的改进版本。由于问题还不太清楚,所以我可以肯定它还不是100%罚款。
data %>%
arrange(person, bin) %>%
group_by(person) %>%
mutate(pair = rep(row_number(), each = 2, length.out = n())) %>%
group_by(person, pair) %>%
summarize(departure = ID1[type == "departure"][1],
arrival = ID1[type == "arrival"][1],
bin = bin[type == "departure"][1])
本文收集自互联网,转载请注明来源。
如有侵权,请联系 [email protected] 删除。
我来说两句