单核内存解决topk问题
作者:互联网
本文介绍的是单核、固定内存条件下的做法:
先按每行内容的哈希值把大文件拆分成多个小文件,再逐个读取小文件,用 HashMap 统计每行出现的次数,同时维护一个小顶堆来保存当前出现次数最多的 K 项。
这样就不必一次性把全部数据读入内存。
package com.qiqi.topk;
/**
* Created By 丛梓祺 on 2021/9/28
* Write this code and change the world
*/
import org.omg.PortableInterceptor.INACTIVE;
import java.awt.image.ImageProducer;
import java.io.*;
import java.util.*;
import java.util.concurrent.TimeUnit;
/**
 * Top-K most frequent lines of a large file, using bounded memory on a single core.
 * <p>
 * 1. Read the input file line by line.
 * 2. Route every line to a partition file chosen by the line's hash, so that all
 *    occurrences of the same line land in the same partition.
 * 3. Read the partition files one at a time.
 * 4. Count the occurrences of each distinct line of a partition in a map.
 * 5. Maintain a min-heap holding the K most frequent lines seen so far.
 * <p>
 * All files are read and written with the platform default charset.
 */
public class TopK {
    /** Path of the large input file; one record per line. */
    private static final String INPUT_PATH = "d:\\bigdata.text";

    /** Directory that receives the hash-partitioned intermediate files. */
    private static final String PARTITION_DIR = "D:\\div";

    /** Number of partitions; partition indices run from 0 to PARTITION_COUNT - 1. */
    private static final int PARTITION_COUNT = 1000;

    /** How many of the most frequent lines are reported. */
    private static final int TOP_K = 10;

    /** A progress message is printed once every this many input lines. */
    private static final int PROGRESS_INTERVAL = 1000;

    /**
     * Splits the input file into partitions, counts the lines of each partition and prints
     * the {@code TOP_K} most frequent lines as {@code line---count}, most frequent first.
     *
     * @param args unused
     * @throws FileNotFoundException if the input file cannot be opened; any other
     *         {@link IOException} is reported on standard error and ends the run
     */
    public static void main(String[] args) throws FileNotFoundException {
        // Opened outside the try block so that a missing input file still propagates to the
        // caller instead of being swallowed by the IOException handler below.
        FileInputStream inputStream = new FileInputStream(new File(INPUT_PATH));
        try {
            System.out.println("开始分文件");
            boolean[] written = splitIntoPartitions(inputStream);
            System.out.println("分文件完毕");

            // Min-heap ordered by count: the head is the weakest of the current top K.
            Queue<Map.Entry<String, Integer>> heap =
                    new PriorityQueue<>(TOP_K, Map.Entry.<String, Integer>comparingByValue());
            for (int i = 0; i < PARTITION_COUNT; i++) {
                System.out.println("第" + i + "个");
                // Only partitions produced by this run are counted; a file left behind by an
                // earlier run must not leak into the result.
                if (!written[i]) {
                    continue;
                }
                offerAll(heap, countLines(partitionFile(i)));
            }
            printDescending(heap);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Reads a whole file into a string using the platform default charset.
     * <p>
     * Only suitable for files that fit in memory. I/O errors (including a missing file) are
     * reported on standard error and whatever was read up to that point is returned.
     *
     * @param file the file to read
     * @return the file content; empty if the file is empty or could not be opened
     */
    public static String readToString(File file) {
        ByteArrayOutputStream content = new ByteArrayOutputStream();
        try (FileInputStream in = new FileInputStream(file)) {
            byte[] buffer = new byte[8192];
            int read;
            // A single read() call may return fewer bytes than requested, so loop until EOF.
            while ((read = in.read(buffer)) != -1) {
                content.write(buffer, 0, read);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return content.toString(); // decoded with the platform default charset
    }

    /** Returns the intermediate file that stores the lines of the given partition. */
    private static File partitionFile(int index) {
        return new File(PARTITION_DIR + "\\d" + index + "file.txt");
    }

    /**
     * Distributes every line of the input over the partition files and closes the input.
     *
     * @return a flag per partition index telling whether this run wrote that partition
     */
    private static boolean[] splitIntoPartitions(InputStream input) throws IOException {
        boolean[] written = new boolean[PARTITION_COUNT];
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(input))) {
            File dir = new File(PARTITION_DIR);
            if (!dir.isDirectory() && !dir.mkdirs()) {
                throw new IOException("Cannot create partition directory " + dir);
            }
            long lineNumber = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                lineNumber++;
                if (lineNumber % PROGRESS_INTERVAL == 0) {
                    System.out.println(lineNumber + "次了" + line);
                }
                // floorMod keeps the index in [0, PARTITION_COUNT) even for negative hashes.
                int partition = Math.floorMod(line.hashCode(), PARTITION_COUNT);
                // The first write of a run truncates the file, later writes append. The
                // writer is closed after every line so file handles are never leaked.
                try (BufferedWriter out = new BufferedWriter(
                        new FileWriter(partitionFile(partition), written[partition]))) {
                    out.write(line);
                    out.newLine();
                }
                written[partition] = true;
            }
        }
        return written;
    }

    /** Counts how often each distinct line occurs in one partition file. */
    private static Map<String, Integer> countLines(File file) throws IOException {
        Map<String, Integer> counts = new HashMap<>();
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = reader.readLine()) != null) {
                counts.merge(line, 1, Integer::sum);
            }
        }
        return counts;
    }

    /** Feeds the counts of one partition into the heap, keeping at most TOP_K entries. */
    private static void offerAll(Queue<Map.Entry<String, Integer>> heap,
                                 Map<String, Integer> counts) {
        for (Map.Entry<String, Integer> candidate : counts.entrySet()) {
            if (heap.size() >= TOP_K) {
                if (heap.peek().getValue() >= candidate.getValue()) {
                    continue;
                }
                heap.poll();
            }
            // Store a detached copy so the heap does not keep the partition's map nodes alive.
            heap.add(new AbstractMap.SimpleImmutableEntry<String, Integer>(
                    candidate.getKey(), candidate.getValue()));
        }
    }

    /** Drains the heap and prints its entries from the highest count to the lowest. */
    private static void printDescending(Queue<Map.Entry<String, Integer>> heap) {
        // poll() yields ascending counts; pushing onto a stack reverses that order.
        Deque<Map.Entry<String, Integer>> ordered = new ArrayDeque<>();
        while (!heap.isEmpty()) {
            ordered.push(heap.poll());
        }
        for (Map.Entry<String, Integer> entry : ordered) {
            System.out.println(entry.getKey() + "---" + entry.getValue());
        }
    }
}
标签:文件,单核,str,System,queue,topk,内存,File,new 来源: https://blog.csdn.net/qqqq157/article/details/120536203