Java超快速文本去重复代码

import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

/**
 * Command-line tool that removes duplicate lines from a UTF-8 text file.
 *
 * <p>Every distinct line is kept in memory while the file is processed, so the
 * input's unique lines must fit in the JVM heap (the usage banner suggests
 * running with {@code -Xmx1000m}).
 */
public class SpeedClear {

	/** A progress marker is printed each time this many lines have been read. */
	private static final int PROGRESS_INTERVAL = 30000;

	/**
	 * Reader buffer size in chars. The original code asked for 20M chars
	 * (about 40 MB of heap, since a Java char is two bytes) while its comment
	 * said "5M"; that memory is better left to the set of unique lines.
	 */
	private static final int READ_BUFFER_CHARS = 1024 * 1024;

	/** Terminator written after every output line. */
	private static final String LINE_END = "\r\n";

	/**
	 * Program entry point.
	 *
	 * <p>With no arguments the usage banner is printed; with any argument
	 * count other than two a format error is printed. Both cases exit with
	 * status 1.
	 *
	 * @param args
	 *            {@code args[0]} is the source file path, {@code args[1]} the
	 *            destination file path
	 */
	public static void main(String[] args) {
		if (args.length == 0) {
			print();
			System.exit(1);
		}
		if (args.length != 2) {
			System.out.println("Format error...");
			System.exit(1);
		}
		clear(args[0], args[1]);
	}

	/**
	 * Copies {@code pathname} to {@code newPath}, keeping only the first
	 * occurrence of each line and preserving the original line order.
	 *
	 * <p>Both files are treated as UTF-8 and every output line ends with
	 * CRLF. Failures are reported on {@code System.err}; this method does not
	 * propagate I/O errors to the caller.
	 *
	 * @param pathname
	 *            path of the source file
	 * @param newPath
	 *            path of the file to create (overwritten if it exists); must
	 *            not be the same file as {@code pathname}
	 */
	public static void clear(String pathname, String newPath) {
		System.out.println("Start... ");

		if (pathname == null || newPath == null) {
			System.err.println("Source and destination paths must not be null.");
			return;
		}

		try {
			int uniqueLines = copyUniqueLines(pathname, newPath);
			System.out.println("size = " + uniqueLines);
			System.out.println("End...");
		} catch (IOException e) {
			// Report the real cause (missing file, permissions, ...) instead of
			// blaming the file size for every failure.
			System.err.println("Failed to de-duplicate " + pathname + " -> " + newPath + ": " + e);
		} catch (OutOfMemoryError e) {
			// Deliberate catch of an Error: running out of heap is the expected
			// failure mode for oversized inputs, and the set of lines is already
			// unreachable here. Message: "file too large, try about 100 MB first".
			System.err.println("文件太大了,建议先100MB大小..");
		}
	}

	/**
	 * Streams the source file line by line and writes each line the first
	 * time it is seen.
	 *
	 * @return the number of distinct lines written
	 * @throws IOException
	 *             if either file cannot be opened, read or written, or if
	 *             both paths refer to the same file
	 */
	private static int copyUniqueLines(String pathname, String newPath) throws IOException {
		File source = new File(pathname).getCanonicalFile();
		File target = new File(newPath).getCanonicalFile();
		if (source.equals(target)) {
			// Opening the output truncates it, which would wipe the input
			// before a single line had been read.
			throw new IOException("source and destination must be different files");
		}

		Set<String> seen = new HashSet<>();
		// try-with-resources closes both streams on every exit path. The reader
		// is opened first so a missing source never truncates the destination.
		try (BufferedReader reader = new BufferedReader(
				new InputStreamReader(new FileInputStream(source), StandardCharsets.UTF_8),
				READ_BUFFER_CHARS);
				BufferedWriter writer = new BufferedWriter(
						new OutputStreamWriter(new FileOutputStream(target), StandardCharsets.UTF_8))) {
			long linesRead = 0;
			String line;
			while ((line = reader.readLine()) != null) {
				// Set.add returns true only for a line not seen before.
				if (seen.add(line)) {
					writer.write(line);
					writer.write(LINE_END);
				}
				if (linesRead % PROGRESS_INTERVAL == 0) {
					System.out.print("..");
				}
				linesRead++;
			}
		}
		// Terminate the row of progress dots.
		System.out.println("");
		return seen.size();
	}

	/** Prints the usage banner to standard output. */
	public static void print() {
		System.out.println("*************************************************");
		System.out.println("\t\tTo repeat \t\t");
		System.out.println();
		System.out.println("  format: java -Xmx1000m SpeedClear c:\\old.txt c:\\new.txt\t\t");
		System.out.println();
		System.out.println("\t\tAuthor:xxser	QQ:616100108");
		System.out.println("*************************************************");
	}

}
 

编程技巧