从给定的 list 中做数据抽样,需要保证采样结果数据的分布均衡。Java 实现如下:
/** * 从min - max之间取出总数为items的随机数 * @param min * @param items * @param max * @return */ private static List<Integer> getRandomId(int min,int items,int max){ List<Integer> ids = Lists.newArrayList(); while(ids.size()<items){ int randomId = ThreadLocalRandom.current().nextInt(max)+min; if(!ids.contains(randomId))ids.add(randomId); } ids.sort((x,y)->x-y); return ids; } /** * 从给定的list中作数据抽样,须要保证采样数据的均匀分布 * 步骤: * 1:将原数据分 m 份, * 2:取得每份的最小index和最大index * 3:从最小index到最大index之间取 n 个list id * 4:从原list中取出对应的id数据 * @param srcDatas 源数据 * @param sampleTotal 抽样的数据总数 * @param splitCopies 将原数据拆分的份数 */ public static List<String> sampleData(List<String> srcDatas,int sampleTotal,int splitCopies){ if(splitCopies<=0)splitCopies = 1; int items = sampleTotal/splitCopies; //从每份中抽取的数据,数据总数将等于sampleTotal List<String> filterRes = Lists.newArrayList();//用于保存最终的抽样结果 ListSplit<String> listSplit = new ListSplit<>();//对list作拆分算法,源码:https://my.oschina.net/u/2391658/blog/703032 List<List<String>> splitRes = listSplit.split(srcDatas,splitCopies); int preListSize = 0;//初始化第一份List的最小下标 for(int i=0;i<splitCopies;i++){ List<String> listBlock = splitRes.get(i);//取出拆分后的list单元 System.out.println(preListSize+"-->"+(preListSize+listBlock.size()-1)); List<Integer> ids = getRandomId(preListSize,items,listBlock.size());//取到排序后的抽样数据id System.out.println(Arrays.toString(ids.toArray())); ids.forEach(id->filterRes.add(srcDatas.get(id)));//取出list 下标id对应的值 preListSize = preListSize+listBlock.size(); //从新初始化list的最小下标 } System.out.println("抽样结果:"); System.out.println(Arrays.toString(filterRes.toArray())); return filterRes; } public static void main(String args[]){ List<String> datas = Lists.newArrayList(); for(int y=0;y<1004;y++)datas.add(y+"");//构造数据 sampleData(datas,100,10); }
执行结果: 算法
0-->99 [5, 6, 25, 28, 29, 38, 48, 69, 72, 81] //从每份list中抽样结果 100-->199 [104, 142, 144, 145, 159, 164, 172, 174, 180, 188] 200-->299 [207, 212, 219, 228, 239, 250, 264, 281, 298, 299] 300-->399 [309, 313, 316, 324, 329, 331, 363, 364, 368, 377] 400-->499 [401, 417, 429, 441, 442, 448, 453, 484, 490, 493] 500-->599 [527, 532, 537, 544, 555, 556, 575, 584, 593, 596] 600-->699 [601, 628, 649, 655, 656, 659, 662, 675, 684, 696] 700-->799 [707, 709, 729, 734, 752, 763, 767, 770, 773, 774] 800-->899 [804, 820, 828, 831, 837, 848, 858, 865, 887, 893] 900-->999 [914, 926, 929, 940, 943, 954, 964, 979, 981, 998] 抽样结果: [5, 6, 25, 28, 29, 38, 48, 69, 72, 81, 104, 142, 144, 145, 159, 164, 172, 174, 180, 188, 207, 212, 219, 228, 239, 250, 264, 281, 298, 299, 309, 313, 316, 324, 329, 331, 363, 364, 368, 377, 401, 417, 429, 441, 442, 448, 453, 484, 490, 493, 527, 532, 537, 544, 555, 556, 575, 584, 593, 596, 601, 628, 649, 655, 656, 659, 662, 675, 684, 696, 707, 709, 729, 734, 752, 763, 767, 770, 773, 774, 804, 820, 828, 831, 837, 848, 858, 865, 887, 893, 914, 926, 929, 940, 943, 954, 964, 979, 981, 998]