`
BradyZhu
  • 浏览: 247740 次
  • 性别: Icon_minigender_1
  • 来自: 上海
社区版块
存档分类
最新评论

开发搜索引擎初步(一)建立索引(Lucene实现)

 
阅读更多

开发自己的搜索引擎完成了一段时间了,现在准备开始梳理一下思路,把以前的总结一下,为以后做真正的“谷歌”埋下伏笔,呵呵。。。。。。

一。Lucene的下载

牛逼的Apache旗下的Lucene,呵呵,无人不知啊,http://lucene.apache.org/,去这个地址自己下载,别说不会Download

二.使用Lucene建立索引

将下载下来的包解压,把里面的Core,memory,analyzer啥的都拿出来,配置到自己的Eclipse上面,下面的事情就是写代码了。

package com.dreamers.creatindex;

import java.io.File;
import java.util.ArrayList;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.store.FSDirectory;
import org.dom4j.DocumentException;
import org.wltea.analyzer.lucene.IKAnalyzer;


import com.dreamers.xml.*;
import com.dreamers.read.*;

/**
 * @category 创建所有XML索引
 * @author bird
 *
 */
public class CreatIndex {
	private String INDEX_STORE_PATH ;
	
	//创建索引
	@SuppressWarnings("deprecation")
	public void creatIndex(){
		try{
			GetPath path = new GetPath();
			INDEX_STORE_PATH = path.getIndexPath();
			File file = new File(INDEX_STORE_PATH);
			Analyzer analyzer = new IKAnalyzer();
			XmlReader xml = new XmlReader();
			FSDirectory directory = FSDirectory.open(file);
			IndexWriter writer = new IndexWriter(directory, analyzer, true,IndexWriter.MaxFieldLength.LIMITED);
			ArrayList<String> lisId = xml.getId();
			ArrayList<String> lisTitle = xml.getTitle();
			ArrayList<String> lisKeyWords = xml.getKeyWords();
			ArrayList<String> lisKind = xml.getKind();
			ArrayList<String> lisDescribe = xml.getDescribe();
			ArrayList<String> lisDate = xml.getDate();
			ArrayList<String> lisUrl = xml.getUrl();
			ArrayList<String> lisAuthor = xml.getAuthor();
			ArrayList<String> lisPublisher = xml.getPublisher();
		
			//System.out.println(lisUrl.get(5));
			for (int i = 0; i < xml.getCount();i++){
				Document doc = new Document();
				//为ID创建Field
				
				Field field = new Field("id",lisId.get(i),Field.Store.YES,Field.Index.NOT_ANALYZED );
				doc.add(field);
				//为title创建索引
				
			    field = new Field("title",lisTitle.get(i),Field.Store.YES,Field.Index.ANALYZED);
				doc.add(field);
				//为keywords创建索引
				
			    field = new Field("keywords",lisKeyWords.get(i),Field.Store.YES,Field.Index.ANALYZED);
				doc.add(field);
				//为kind创建索引
				
				field = new Field("kind",lisKind.get(i),Field.Store.YES,Field.Index.NOT_ANALYZED);
				doc.add(field);
				//为describe创建索引
				
			     field = new Field("describe",lisDescribe.get(i),Field.Store.YES,Field.Index.ANALYZED);
				doc.add(field);
				//为data创建索引
				
			    field = new Field("date",lisDate.get(i),Field.Store.YES,Field.Index.NOT_ANALYZED);
				doc.add(field);
				//为URL创建索引
				
				field = new Field("url",lisUrl.get(i),Field.Store.YES,Field.Index.NOT_ANALYZED);
				doc.add(field);
				//为author创建索引
				
				field = new Field("author",lisAuthor.get(i),Field.Store.YES,Field.Index.NOT_ANALYZED);
				doc.add(field);
				//为publisher创建索引
				
				field = new Field("publisher",lisPublisher.get(i),Field.Store.YES,Field.Index.NOT_ANALYZED);
				doc.add(field);
				
								}
				
				writer.addDocument(doc);
			}   
			
			
			
			
			 writer.close();
			 //directory.close();
			System.out.println("索引创建完毕");
			
			
		}  catch (Exception e){
			e.printStackTrace();
		
		}
		
	}
  
	public static void main(String [] args) throws DocumentException{
		CreatIndex index = new CreatIndex();
		index.creatIndex();
	}
}


这里不多说,最上面的每个list里面都藏有巨大的信息,都是一些字符串,就当是放到容器里的字符串吧,然后下面为每个字段建立索引的过程都是一样的,代码比较短,就不需要太多注释了,呵呵
分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics