使用lucene 4.0版本的全文檢索
所需要的jar包 
網速太慢,下次有空再把jar傳上來
1.FileIndex 建立索引,查詢,刪除,更新
package com.strongit.tool.retrieval;
import java.io.File;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import jcifs.smb.SmbException;
import jcifs.smb.SmbFile;
import jcifs.smb.SmbFileFilter;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.LongField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.Version;
import com.strongit.util.BaseinfoConfigurer;
/**
 * Builds, searches, updates and deletes a Lucene full-text index over files
 * that live on an SMB file server.
 *
 * <p>Every indexed document carries three stored text fields:
 * {@code filename}, {@code content} and {@code path}. Only {@link #update()}
 * additionally writes an {@code id} field, which is the field
 * {@link #delete(String)} matches on.
 *
 * <p>NOTE(review): {@code SmbFile.getPath()} presumably returns the complete
 * {@code smb://} URL, which would include the {@code user:password} part when
 * the URL was built with credentials. That value is both printed to the
 * console and stored in the index; confirm this is acceptable.
 */
public class FileIndex {
    /** File-system directory that holds the Lucene index. */
    private static final String INDEX_DIR = "D:\\index";

    /** Maximum number of hits {@link #search(String)} asks Lucene for. */
    private static final int MAX_HITS = 10000;

    /** Extensions (without the dot) accepted for files found in sub-folders. */
    private static final String[] INDEXABLE_EXTENSIONS = {
        "txt", "ppt", "doc", "docx", "xls", "xlsx", "pdf"
    };

    /**
     * Manual entry point; intentionally does nothing by default.
     *
     * <p>Typical calls to try out by hand:
     * {@code createIndex(path)} to rebuild the index,
     * {@code search("...")} to query it,
     * {@code insert(paths)} to add files without dropping existing entries,
     * {@code delete("1470817624520")} to remove a document by id, and
     * {@code update()} to re-index the sample file.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        // No default action; see the Javadoc above for the calls to experiment with.
    }

    /**
     * Deletes every document whose {@code id} field contains the given term.
     *
     * <p>Only documents written by {@link #update()} carry an {@code id}
     * field; documents added by {@link #insert(List)} or
     * {@link #createIndex(String)} do not and are therefore never matched.
     *
     * @param str the id that was assigned when the document was indexed
     * @throws Exception if the index cannot be opened or modified
     */
    public static void delete(String str) throws Exception {
        long start = System.currentTimeMillis();
        Directory dir = openIndexDirectory();
        try {
            IndexWriter writer = openWriter(dir);
            try {
                writer.deleteDocuments(new Term("id", str));
            } finally {
                // Always release the writer so the index write lock is not left behind.
                writer.close();
            }
        } finally {
            dir.close();
        }
        System.out.println("刪除索引耗時:" + (System.currentTimeMillis() - start)
                + "ms\n");
    }

    /**
     * Adds the given files to the index without removing existing entries.
     *
     * <p>A single writer is used for the whole batch. A file whose content
     * cannot be read aborts the batch with an exception; files added before
     * it are still committed when the writer is closed. A failure while
     * writing one document is printed and the batch continues.
     *
     * @param listname SMB URLs (as {@code String}s) of the files to index;
     *                 {@code null} or an empty list is a no-op
     * @throws Exception if a URL is malformed, a file cannot be read, or the
     *                   index cannot be opened
     */
    public static void insert(List<?> listname) throws Exception {
        if (listname == null || listname.isEmpty()) {
            return;
        }
        Directory dir = openIndexDirectory();
        try {
            IndexWriter writer = openWriter(dir);
            try {
                for (Object entry : listname) {
                    // Build the SmbFile lazily so earlier entries are indexed
                    // even when a later URL turns out to be malformed.
                    SmbFile file = new SmbFile((String) entry);
                    addFile(writer, file, false);
                }
            } finally {
                writer.close();
            }
        } finally {
            dir.close();
        }
    }

    /**
     * Runs a query against the {@code content} field and returns the file
     * names of the matching documents (each name is also printed).
     *
     * <p>The original author notes that the parser is created with
     * {@code Version.LUCENE_30} on purpose so that the query text is searched
     * as a keyword rather than being segmented. NOTE(review): verify this
     * against the Lucene version in use.
     *
     * @param str query text in Lucene classic query-parser syntax
     * @return list of {@code String} file names, at most {@value #MAX_HITS}
     * @throws Exception if the index cannot be read or the query cannot be parsed
     */
    public static List search(String str) throws Exception {
        Directory dir = FSDirectory.open(new File(INDEX_DIR));
        try {
            DirectoryReader reader = DirectoryReader.open(dir);
            try {
                Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
                IndexSearcher searcher = new IndexSearcher(reader);
                QueryParser parser = new QueryParser(Version.LUCENE_30, "content",
                        analyzer);
                Query query = parser.parse(str);
                List<String> names = new ArrayList<String>();
                ScoreDoc[] hits = searcher.search(query, null, MAX_HITS).scoreDocs;
                for (ScoreDoc hit : hits) {
                    String filename = searcher.doc(hit.doc).get("filename");
                    System.out.println(filename);
                    names.add(filename);
                }
                return names;
            } finally {
                reader.close();
            }
        } finally {
            dir.close();
        }
    }

    /**
     * Re-indexes one hard-coded sample file, replacing any document whose
     * {@code filename} field contains the term {@code text1}.
     *
     * <p>The new document gets an {@code id} field holding the start time in
     * milliseconds, so it can later be removed with {@link #delete(String)}.
     *
     * <p>NOTE(review): both the path and the replacement term are sample
     * values left by the original author. The path is a local Windows path
     * handed to {@code SmbFile}, and the term does not match the sample file
     * name; adjust both before real use.
     *
     * @throws Exception if the file cannot be read or the index cannot be opened
     */
    public static void update() throws Exception {
        String path = "D:\\file\\file\\f1\\test2.txt";
        SmbFile file = new SmbFile(path);
        long start = System.currentTimeMillis();

        String text = extractContent(file);
        System.out.println("name :" + file.getName());
        System.out.println("path :" + file.getPath());
        System.out.println();

        Directory dir = openIndexDirectory();
        try {
            IndexWriter writer = openWriter(dir);
            try {
                Document document = buildDocument(file, text, String.valueOf(start));
                writer.updateDocument(new Term("filename", "text1"), document);
            } catch (Exception e) {
                // Best effort, as before: report the failure and carry on.
                e.printStackTrace();
            } finally {
                writer.close();
            }
        } finally {
            dir.close();
        }
        System.out.println("更新索引耗時:" + (System.currentTimeMillis() - start)
                + "ms\n");
    }

    /**
     * Rebuilds the whole index: deletes the existing index directory, walks
     * the configured SMB share and indexes every file returned by
     * {@link #searchFile(SmbFile)}.
     *
     * <p>The share URL is assembled from the {@code username},
     * {@code possword}, {@code fileServerIp} and {@code sharedirectory}
     * properties of {@code BaseinfoConfigurer}; the {@code path} argument is
     * not used.
     *
     * @param path ignored; kept for compatibility with existing callers
     * @return always {@code true} when the method returns normally
     * @throws Exception if the share URL is malformed, a file cannot be read,
     *                   or the index cannot be opened
     */
    public static boolean createIndex(String path) throws Exception {
        long start = System.currentTimeMillis();
        String username = (String) BaseinfoConfigurer
                .getContextProperty("username");
        String possword = (String) BaseinfoConfigurer
                .getContextProperty("possword");
        String fileServerIp = (String) BaseinfoConfigurer
                .getContextProperty("fileServerIp");
        String sharedirectory = (String) BaseinfoConfigurer
                .getContextProperty("sharedirectory");
        String shareUrl = "smb" + "://" + username + ":" + possword + "@"
                + fileServerIp + "/" + sharedirectory + "/";

        // Drop the previous index before rebuilding it from scratch.
        ReadFile.deleteDir(new File(INDEX_DIR + "\\"));

        SmbFile[] files = searchFile(new SmbFile(shareUrl));
        Directory dir = openIndexDirectory();
        try {
            IndexWriter writer = openWriter(dir);
            try {
                for (SmbFile file : files) {
                    addFile(writer, file, true);
                }
            } finally {
                writer.close();
            }
        } finally {
            dir.close();
        }
        System.out.println("創建索引-----耗時:" + (System.currentTimeMillis() - start)
                + "ms\n");
        return true;
    }

    /**
     * Recursively collects the files below {@code folder}.
     *
     * <p>Files directly inside {@code folder} are returned regardless of
     * their extension. Files that come from sub-folders are kept only when
     * their extension is one of {@code txt, ppt, doc, docx, xls, xlsx, pdf}
     * (compared case-insensitively). NOTE(review): the unfiltered top level
     * is inherited from the original implementation; confirm it is intended.
     *
     * @param folder the SMB directory to scan
     * @return the files found; empty (never {@code null}) when the folder
     *         cannot be listed
     */
    public static SmbFile[] searchFile(SmbFile folder) {
        SmbFile[] children = null;
        try {
            children = folder.listFiles(new SmbFileFilter() {
                @Override
                public boolean accept(SmbFile pathname) {
                    try {
                        return pathname.isDirectory() || pathname.isFile();
                    } catch (SmbException ignored) {
                        // An entry whose type cannot be determined is skipped.
                        return false;
                    }
                }
            });
        } catch (SmbException e) {
            e.printStackTrace();
        }

        List<SmbFile> result = new ArrayList<SmbFile>();
        if (children == null) {
            // Listing failed (already reported above): nothing to collect.
            return new SmbFile[0];
        }
        for (SmbFile child : children) {
            try {
                if (child.isFile()) {
                    result.add(child);
                } else {
                    for (SmbFile nested : searchFile(child)) {
                        if (hasIndexableExtension(nested.getName())) {
                            result.add(nested);
                        }
                    }
                }
            } catch (SmbException e) {
                e.printStackTrace();
            }
        }
        return result.toArray(new SmbFile[result.size()]);
    }

    /** Opens the index directory, creating it on disk first when it is missing. */
    private static Directory openIndexDirectory() throws Exception {
        File indexFile = new File(INDEX_DIR);
        if (!indexFile.exists()) {
            indexFile.mkdirs();
        }
        return FSDirectory.open(indexFile);
    }

    /** Creates an index writer on {@code dir} that uses the standard analyzer. */
    private static IndexWriter openWriter(Directory dir) throws Exception {
        Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_CURRENT);
        IndexWriterConfig config = new IndexWriterConfig(
                Version.LUCENE_CURRENT, analyzer);
        return new IndexWriter(dir, config);
    }

    /** Reads one file and adds it to the index; write failures are printed and skipped. */
    private static void addFile(IndexWriter writer, SmbFile file, boolean verbose)
            throws Exception {
        String text = extractContent(file);
        if (verbose) {
            System.out.println("name :" + file.getName());
            System.out.println("path :" + file.getPath());
            System.out.println();
        }
        try {
            writer.addDocument(buildDocument(file, text, null));
        } catch (Exception e) {
            // Best effort: one document that cannot be written must not stop the batch.
            e.printStackTrace();
        }
    }

    /** Extracts the text of {@code file} by its extension; unknown types yield an empty string. */
    private static String extractContent(SmbFile file) throws Exception {
        String name = file.getName();
        String type = name.substring(name.lastIndexOf(".") + 1);
        String path = file.getPath();
        if ("txt".equalsIgnoreCase(type)) {
            return ReadFile.readTxt(path, "gb2312");
        } else if ("doc".equalsIgnoreCase(type)) {
            return ReadFile.readWorddoc(path);
        } else if ("xls".equalsIgnoreCase(type)) {
            return ReadFile.xls2String(path);
        } else if ("xlsx".equalsIgnoreCase(type)) {
            return ReadFile.readExcel2007(path);
        } else if ("ppt".equalsIgnoreCase(type)) {
            return ReadFile.readPowerPoint(path);
        } else if ("pdf".equalsIgnoreCase(type)) {
            return ReadFile.readPdf(path);
        } else if ("docx".equalsIgnoreCase(type)) {
            return ReadFile.readWorddocx(path);
        }
        return "";
    }

    /** Builds the Lucene document for a file; the {@code id} field is added only when {@code id} is non-null. */
    private static Document buildDocument(SmbFile file, String text, String id) {
        Document document = new Document();
        document.add(new TextField("filename", file.getName(), Store.YES));
        document.add(new TextField("content", text, Store.YES));
        document.add(new TextField("path", file.getPath(), Store.YES));
        if (id != null) {
            document.add(new TextField("id", id, Store.YES));
        }
        return document;
    }

    /** Tells whether {@code name} ends in one of the supported extensions, ignoring case. */
    private static boolean hasIndexableExtension(String name) {
        int dot = name.lastIndexOf('.');
        if (dot < 0) {
            // No extension at all: not indexable (and nothing to substring).
            return false;
        }
        String extension = name.substring(dot + 1);
        for (String known : INDEXABLE_EXTENSIONS) {
            if (known.equalsIgnoreCase(extension)) {
                return true;
            }
        }
        return false;
    }
}
2.讀取文檔的方法類 txt,xlsx,xls,ppt,pdf,doc, docx(不能讀取圖片)
package com.strongit.tool.retrieval;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStreamReader;
import java.util.List;
import jcifs.smb.SmbFileInputStream;
import jxl.Cell;
import jxl.Sheet;
import jxl.Workbook;
import org.apache.lucene.index.IndexWriter;
import org.apache.pdfbox.pdfparser.PDFParser;
import org.apache.pdfbox.util.PDFTextStripper;
import org.apache.poi.hslf.HSLFSlideShow;
import org.apache.poi.hslf.model.Slide;
import org.apache.poi.hslf.model.TextRun;
import org.apache.poi.hslf.usermodel.SlideShow;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.usermodel.Paragraph;
import org.apache.poi.hwpf.usermodel.Range;
import org.apache.poi.xssf.usermodel.XSSFCell;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.poi.xwpf.usermodel.XWPFParagraph;
/**
* 讀取文檔方法
* ClassName: ReadFile
* @Description: TODO
* @date 2016年8月10日
* @author wsx
*/
public class ReadFile {
private static ReadFile indexManager;
/**
* 讀取doc文件內容
* @param filepath 想要讀取的文件地址
* @return 返回文件內容
*/
public static String readWorddoc(String filepath) {
StringBuffer content = new StringBuffer("");// 文檔內容
try {
HWPFDocument doc = new HWPFDocument(new SmbFileInputStream(filepath));
Range range = doc.getRange();
int paragraphCount = range.numParagraphs();// 段落
for (int i = 0; i < paragraphCount; i++) {// 遍歷段落讀取數據
Paragraph pp = range.getParagraph(i);
content.append(pp.text());
}
} catch (Exception e) {
e.printStackTrace();
}
return content.toString().trim();
}
/**
* docx 格式建立索引,圖片沒有讀到,只讀取的數據
* @Description: TODO
* @param @param filepath
* @param @return
* @return String
* @date 2016年8月12日
*/
public static String readWorddocx(String filepath) {
StringBuffer content = new StringBuffer("");// 文檔內容
try {
// D://file//docx.docx D://file//doc.doc
// filepath = "D://file//docx.docx";
SmbFileInputStream in = new SmbFileInputStream(filepath);//載入文檔
//word docx 圖片不會被讀取,只讀取數據
XWPFDocument xwpf = new XWPFDocument(in);//得到word文檔的信息
List<XWPFParagraph> listParagraphs = xwpf.getParagraphs();//得到段落信息
for(int i =0;i<listParagraphs.size();i++){
String cont = listParagraphs.get(i).getRuns().toString();
content.append(cont);
}
} catch (Exception e) {
e.printStackTrace();
}
return content.toString().trim();
}
/**
* 讀取xls文件內容
* @param filepath 想要讀取的文件對象
* @return 返回文件內容
*/
public static String xls2String(String filepath){
String result = "";
try{
SmbFileInputStream fis = new SmbFileInputStream(filepath);
StringBuilder sb = new StringBuilder();
jxl.Workbook rwb = Workbook.getWorkbook(fis);
Sheet[] sheet = rwb.getSheets();
for (int i = 0; i < sheet.length; i++) {
Sheet rs = rwb.getSheet(i);
for (int j = 0; j < rs.getRows(); j++) {
Cell[] cells = rs.getRow(j);
for(int k=0;k<cells.length;k++)
sb.append(cells[k].getContents() + " ");
}
}
fis.close();
result += sb.toString();
}catch(Exception e){
e.printStackTrace();
}
return result;
}
/**
* PDF格式 文件創建索引
* @Description: TODO
* @param @param path
* @param @return
* @param @throws Exception
* @return String
* @date 2016年8月11日
*/
public static String readPdf(String path) throws Exception {
StringBuffer content = new StringBuffer("");// 文檔內容
SmbFileInputStream fis = new SmbFileInputStream(path);
PDFParser p = new PDFParser(fis);
p.parse();
PDFTextStripper ts = new PDFTextStripper();
content.append(ts.getText(p.getPDDocument()));
fis.close();
return content.toString().trim();
}
/**
* 讀取xlsx格式的excel文檔
* @param @param filepath
* @param @throws IOException
* @author wusongxiao
* @date 2016年8月10日
*/
public static String readExcel2007(String filepath) throws IOException {
// System.out.println(filepath);
StringBuffer content = new StringBuffer();
// 構造 XSSFWorkbook 對象,strPath 傳入文件路徑 **** SmbFileInputStream SMB讀取文件 ***
XSSFWorkbook xwb = new XSSFWorkbook(new SmbFileInputStream(filepath));
// 循環工作表Sheet
for (int numSheet = 0; numSheet < xwb.getNumberOfSheets(); numSheet++) {
XSSFSheet xSheet = xwb.getSheetAt(numSheet);
if (xSheet == null) {
continue;
}
// 循環行Row
for (int rowNum = 0; rowNum <= xSheet.getLastRowNum(); rowNum++) {
XSSFRow xRow = xSheet.getRow(rowNum);
if (xRow == null) {
continue;
}
// 循環列Cell
for (int cellNum = 0; cellNum <= xRow.getLastCellNum(); cellNum++) {
XSSFCell xCell = xRow.getCell(cellNum);
if (xCell == null) {
continue;
}
String s = null;
if (xCell.getCellType() == XSSFCell.CELL_TYPE_BOOLEAN) {
content.append(xCell.getBooleanCellValue());
} else if (xCell.getCellType() == XSSFCell.CELL_TYPE_NUMERIC) {
content.append(xCell.getNumericCellValue());
} else {
content.append(xCell.getStringCellValue() + ""); //+ "\n"
}
}
}
}
return content.toString();
}
/**
* 讀取txt文檔
* @param @param filepath 地址
* @param @param charSet 編碼格式
* @param @throws IOException
* @author wusongxiao
* @date 2016年8月10日
*/
public static String readTxt(String filepath, String charSet)
throws IOException {
BufferedReader reader = new BufferedReader(new InputStreamReader(
new SmbFileInputStream(filepath), charSet)); //reader.readLine() 讀取txt文本 String的
String line = new String();
String temp = new String();
while ((line = reader.readLine()) != null) {
temp += line;
}
reader.close();
return temp;
}
/**
* 讀取ppt文件
* @Description: TODO
* @param @param filepath
* @param @return
* @return String
* @date 2016年8月10日
*/
public static String readPowerPoint(String filepath) {
StringBuffer content = new StringBuffer("");
try {
SlideShow ss = new SlideShow(new HSLFSlideShow(new SmbFileInputStream(filepath)));// is
// 為文件的InputStream,建立SlideShow
Slide[] slides = ss.getSlides();// 獲得每一張幻燈片
for (int i = 0; i < slides.length; i++) {
TextRun[] t = slides[i].getTextRuns();// 為了取得幻燈片的文字內容,建立TextRun
for (int j = 0; j < t.length; j++) {
content.append(t[j].getText());// 這裡會將文字內容加到content中去
}
}
} catch (Exception ex) {
System.out.println(ex.toString());
}
return content.toString();
}
public static void closeWriter(IndexWriter indexWriter) throws Exception {
if (indexWriter != null) {
indexWriter.close();
}
}
/**
* 創建索引管理器
* @return 返回索引管理器對象
*/
public ReadFile getManager(){
if(indexManager == null){
this.indexManager = new ReadFile();
}
return indexManager;
}
/**
* 刪除目錄下的所有索引
* @Description: TODO
* @param @param file
* @param @return
* @return boolean
* @throws
* @author wusongxiao
* @date 2016年8月10日
*/
public static boolean deleteDir(File file){
if(file.isDirectory()){
File[] files = file.listFiles();
for(int i=0; i<files.length; i++){
deleteDir(files[i]);
}
}
file.delete();
return true;
}
}
以上全部程式碼都是基於 SMB 文件服務器的 Lucene 4.0 全文檢索。如果要檢索的是本地文件,只需要把所有以 Smb 開頭的類換成對應的 java.io 類(例如 SmbFileInputStream 去掉 Smb 改為 FileInputStream、SmbFile 改為 File),並把 smb:// 地址換成本地路徑就可以了。