来自数据的不规则采样

汤姆

我有一个数据集,如下所示:

# Example data for the question, written as a structure() literal (this looks
# like dput() output). It is a 90-row table with five numeric columns:
#   Year                  - 2005, 2006 or 2007 (30 rows each)
#   Type                  - group label 1, 2 or 3; the snippets below sample by it
#   Value, Value2, Value3 - numeric values between 0 and 1
# The object carries the classes "tbl_df", "tbl" and "data.frame"; the later
# snippets call setDT() on it before using data.table syntax.
DT <- structure(list(Year = c(2005, 2005, 2005, 2005, 2005, 2005, 2005, 
2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 
2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 
2005, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 
2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 
2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2006, 2007, 2007, 
2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 
2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 2007, 
2007, 2007, 2007, 2007, 2007, 2007), Type = c(1, 1, 1, 1, 1, 
1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 
3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 
1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 
3), Value = c(0.504376244608734, 0.544791523560323, 0.536356351248399, 
0.186754489979335, 0.0145059662169885, 0.552467068108315, 0.728991908748136, 
0.0782701833265232, 0.0770140143185365, 0.745720346755096, 0.182549844851049, 
0.0037854136407528, 0.892426526130476, 0.670307075099745, 0.0787676704471466, 
0.243642889274613, 0.61622932816441, 0.773909954748003, 0.0368627127466908, 
0.864836276200213, 0.363247130858897, 0.170719500081567, 0.458862115912474, 
0.764369844834086, 0.22138732039061, 0.950217140815184, 0.119026355092504, 
0.806698643902745, 0.809697143416323, 0.0161168403745759, 0.56149794546334, 
0.0663374185634651, 0.851044662622003, 0.144127493261805, 0.646129610173195, 
0.180326314861961, 0.346305710081752, 0.689186084156133, 0.0902438913162577, 
0.493067567084055, 0.829728867159447, 0.212655417404949, 0.873112880345332, 
0.57019799015934, 0.666924788035991, 0.421470848297274, 0.137822577124685, 
0.646797965126931, 0.00186628356193685, 0.220630784144145, 0.636097250212043, 
0.337161167241577, 0.763014675300797, 0.0290609945874959, 0.179775595422681, 
0.926270372245386, 0.14413707866326, 0.308460218540821, 0.505730133160804, 
0.92831463570813, 0.2406601397661, 0.469013177711661, 0.0514836845684897, 
0.8773477591701, 0.988870207825279, 0.0409427390691713, 0.345261503182235, 
0.457678159145652, 0.928521904779235, 0.981654149874765, 0.165376851871405, 
0.657749413049735, 0.645610554242246, 0.288901032482677, 0.903464871012278, 
0.91288926903878, 0.331819964874993, 0.451775254733976, 0.561567931867726, 
0.934770693643712, 0.0515071551015609, 0.0772762108900331, 0.233674539049138, 
0.636764452840065, 0.673165028674493, 0.806944576060158, 0.763410488346345, 
0.661058275398286, 0.275215831961986, 0.821051953775588), Value2 = c(0.898973133700585, 
0.0043728119746469, 0.90370150590114, 0.664255277142381, 0.478255150030532, 
0.428181937562552, 0.0547471373342867, 0.382060484866744, 0.467990590870777, 
0.44613758335896, 0.767317422802576, 0.378150639908367, 0.490578474103678, 
0.677901331005272, 0.287571260541928, 0.201396158908221, 0.504989505596871, 
0.854550423135574, 0.545208640791417, 0.951248990134053, 0.958420479001103, 
0.916437669811835, 0.299402641214852, 0.966388390213139, 0.511359402704707, 
0.0867219533353825, 0.88481040004275, 0.158676351804193, 0.0723357399252373, 
0.605048894989562, 0.60104443547608, 0.608164723564692, 0.309073275149768, 
0.183031315824665, 0.495737621177827, 0.981936843144856, 0.601436476710344, 
0.442362735422709, 0.497899316486054, 0.0545162134700136, 0.572666465987199, 
0.0134330483790179, 0.494252845049882, 0.752561338910785, 0.269231150235318, 
0.580397043886635, 0.00438648885146109, 0.974859546601355, 0.964309270817873, 
0.740961468264743, 0.966289928060099, 0.165450408579171, 0.457088887715921, 
0.725271665700556, 0.611801886877621, 0.693114823445831, 0.509441044895801, 
0.668642268489104, 0.0769213109282016, 0.0106313240133811, 0.653738670103508, 
0.515077318720933, 0.0355798295524966, 0.916849288357794, 0.489540407953311, 
0.355080030655249, 0.0584185346727107, 0.117505910926226, 0.840486642923002, 
0.0919621689925281, 0.513293731647231, 0.813987689492758, 0.520895630669219, 
0.417642884334403, 0.549898208275446, 0.190152036926942, 0.730222922437507, 
0.247328458018061, 0.587109508511267, 0.850096530635719, 0.929032051736368, 
0.929910983683225, 0.461558252621238, 0.106247873795127, 0.177666580357953, 
0.85962988262837, 0.531897323076434, 0.105528819826748, 0.0349104003049517, 
0.180758384726269), Value3 = c(0.728747048185938, 0.136214396563203, 
0.0552254916905935, 0.888943411458351, 0.593186561829418, 0.142192475897417, 
0.397839605231809, 0.128332683559321, 0.818143628566787, 0.675081193031822, 
0.267554700398382, 0.289692778583473, 0.395043380675461, 0.582592369450023, 
0.999361780203229, 0.421977850130829, 0.723404859329269, 0.333410997686596, 
0.545945290276875, 0.510878802866974, 0.746682101648222, 0.625853669469718, 
0.0366957172106372, 0.417685335838607, 0.106323486037796, 0.0127310987059773, 
0.291264331038641, 0.690392584005106, 0.0367947033685097, 0.287721087095362, 
0.389582158765541, 0.179954765659721, 0.688980485242488, 0.492296704771236, 
0.177765364735501, 0.311877860895471, 0.402659917512069, 0.579307427105039, 
0.588566648357923, 0.741057591300206, 0.111932877257211, 0.515443723005798, 
0.679584351614947, 0.0197622696399569, 0.0326379476305644, 0.736148474541639, 
0.0115696238487739, 0.0530159587501624, 0.710708890129421, 0.537042840144158, 
0.0277825198238522, 0.851349803530179, 0.448963399024373, 0.42841165712813, 
0.0615511042450435, 0.210541933956987, 0.983517611560273, 0.533691182135933, 
0.61993895519575, 0.136074538018663, 0.716185070081669, 0.67982888131481, 
0.186059692566576, 0.0129160598675656, 0.832257317305668, 0.0269936347869698, 
0.579065014243438, 0.857987264303428, 0.270050217297758, 0.606374993010002, 
0.565105220120649, 0.977264711860796, 0.14241840012272, 0.942496958955904, 
0.652070963472916, 0.912867524689929, 0.0249357414986835, 0.87704909395977, 
0.72849611059358, 0.525707690655331, 0.290223239565496, 0.992723233891769, 
0.178173444691217, 0.0292681960925434, 0.65696953770876, 0.452973377851251, 
0.471917712361899, 0.117830393053313, 0.126107861454795, 0.0848074010166607
)), row.names = c(NA, -90L), class = c("tbl_df", "tbl", "data.frame"
))

我想从这个 DT 中按组抽样,类似这样:

# Convert DT to a data.table by reference, then keep up to three randomly
# chosen rows from every Type group (min() caps the draw at the group size).
setDT(DT)
DT_new <- DT[, .SD[sample(.N, min(3, .N))], by = Type]

但是,我不想从每种类型中固定抽取三行,而是想根据以下数据指定每种类型要抽取的行数:

# Lookup table: for each Type, n is the number of rows to sample.
# NOTE(review): the `groups` attribute below (Year plus .rows indices 1:9)
# does not match this 3-row table; it looks like a leftover from an earlier
# grouped object and is reproduced unchanged here -- confirm before relying
# on it.
Ratio <- structure(
  list(
    Type = c(1, 2, 3),
    n = c(13L, 13L, 4L)
  ),
  row.names = c(NA, -3L),
  class = c("data.table", "data.frame"),
  groups = structure(
    list(
      Year = c(2005, 2006, 2007),
      .rows = list(1:3, 4:6, 7:9)
    ),
    row.names = c(NA, -3L),
    class = c("tbl_df", "tbl", "data.frame"),
    .drop = TRUE
  )
)

(即上面的 `Ratio` 表。)

所需结果:

# Hand-built illustration of the desired result: 13 rows of Type 1, 13 rows of
# Type 2 and 4 rows of Type 3 (see the table() output below).
# Each line draws a per-Type sample and then keeps only the row range that
# belongs to one Type in the grouped result; the ranges are hard-coded.
setDT(DT)
DT_A <- DT[, .SD[sample(.N, min(13, .N))], by = Type][1:13]   # Type 1 rows
DT_B <- DT[, .SD[sample(.N, min(13, .N))], by = Type][14:26]  # Type 2 rows
DT_C <- DT[, .SD[sample(.N, min(4, .N))], by = Type][9:12]    # Type 3 rows
DT_new <- rbind(DT_A, DT_B, DT_C)
# Check the number of sampled rows per Type.
table(DT_new[["Type"]])

 1  2  3 
13 13  4
格雷戈尔·托马斯(Gregor Thomas)

我们可以先把 `Ratio` 连接(join)到 `DT` 上,然后在你的抽样命令中直接使用 `n`。

我添加了一个 `ID` 列以便看清结果。当然,你可以在最后删除 `ID` 和 `n` 列。

# 1. Number the rows within each Type (adds an ID column to DT by reference).
# 2. Join Ratio on Type so every row carries its group's sample size n.
# 3. Per Type, draw n rows at random; n is constant within a group, so its
#    first element is used, and min() caps the draw at the group size.
DT[, ID := seq_len(.N), by = Type][
  Ratio,
  on = "Type"
][
  ,
  .SD[sample(.N, min(n[1L], .N))],
  by = Type
]
#     Type Year      Value      Value2     Value3 ID  n
#  1:    1 2006 0.64612961 0.495737621 0.17776536 18 13
#  2:    1 2005 0.18675449 0.664255277 0.88894341  4 13
#  3:    1 2006 0.57019799 0.752561339 0.01976227 27 13
#  4:    1 2006 0.82972887 0.572666466 0.11193288 24 13
#  5:    1 2006 0.87311288 0.494252845 0.67958435 26 13
#  6:    1 2007 0.46901318 0.515077319 0.67982888 31 13
#  7:    1 2006 0.09024389 0.497899316 0.58856665 22 13
#  8:    1 2005 0.50437624 0.898973134 0.72874705  1 13
#  9:    1 2005 0.54479152 0.004372812 0.13621440  2 13
# 10:    1 2005 0.53635635 0.903701506 0.05522549  3 13
# 11:    1 2007 0.05148368 0.035579830 0.18605969 32 13
# 12:    1 2007 0.45767816 0.117505911 0.85798726 37 13
# 13:    1 2006 0.14412749 0.183031316 0.49229670 17 13
# 14:    2 2005 0.07876767 0.287571261 0.99936178  2 13
# 15:    2 2007 0.90346487 0.549898208 0.65207096 28 13
# 16:    2 2005 0.76436984 0.966388390 0.41768534 11 13
# 17:    2 2005 0.67030708 0.677901331 0.58259237  1 13
# 18:    2 2006 0.33716117 0.165450409 0.85134980 19 13
# 19:    2 2007 0.64561055 0.520895631 0.14241840 26 13
# 20:    2 2006 0.63609725 0.966289928 0.02778252 18 13
# 21:    2 2007 0.56156793 0.587109509 0.72849611 32 13
# 22:    2 2007 0.67316503 0.177666580 0.65696954 38 13
# 23:    2 2007 0.93477069 0.850096531 0.52570769 33 13
# 24:    2 2007 0.23367454 0.461558253 0.17817344 36 13
# 25:    2 2007 0.63676445 0.106247874 0.02926820 37 13
# 26:    2 2006 0.92627037 0.693114823 0.21054193 23 13
# 27:    3 2005 0.80669864 0.158676352 0.69039258  2  4
# 28:    3 2006 0.92831464 0.010631324 0.13607454  7  4
# 29:    3 2007 0.76341049 0.531897323 0.47191771  9  4
# 30:    3 2005 0.01611684 0.605048895 0.28772109  4  4
#     Type Year      Value      Value2     Value3 ID  n

本文收集自互联网,转载请注明来源。

如有侵权,请联系 [email protected] 删除。

编辑于
0

我来说两句

0 条评论
登录 后参与评论

相关文章