功能包括:创建索引、检索索引、高亮显示查询结果。分词使用的是庖丁解牛。
使用前先下载相关的LuceneCore jar包、LuceneHighLighter jar包、庖丁解牛分词jar包、庖丁解牛词典,并设定环境变量PAODING_DIC_HOME指向词典位置。
前两个可以到官方网站找,庖丁去http://code.google.com/p/paoding/downloads/list下载。
Lucene庖丁整合方式1:
1、将paoding-analysis.jar拷贝到项目的WEB-INF/lib目录;
2、接着需要设置环境变量PAODING_DIC_HOME,变量名:PAODING_DIC_HOME 变量值:E:\paoding\dic
3、第三步将E:\paoding\src目录下的paoding-dic-home.properties属性文件拷贝到项目的src目录下,添加2行
paoding.dic.home.config-fisrt=this
paoding.dic.home=E:/paoding/dic
Lucene庖丁整合方式2:
修改E:\paoding\src\paoding-dic-home.properties,增加一行
paoding.dic.home=classpath:dic
然后运行ant重新生成一个庖丁jar,拷贝到lib下就OK了。
第一种方式便于更新字典,第二种便于移植。本例使用第二种方法整合。
关于庖丁环境的设置可以参考net\paoding\analysis\Constants.java。
使用时注意LuceneCore和LuceneHighLighter的版本配置。我开始使用lucene-core-2.3.2.jar+Highlighter 2.4,后台报错,明显的版本问题。现在使用的是Lucene 2.3.2 + Highlighter 2.2.0。
主要代码实现:
CreateIndex:创建索引文件
- package
demo; -
- import
java.io.BufferedReader; - import
java.io.File; - import
java.io.FileInputStream; - import
java.io.IOException; - import
java.io.InputStreamReader; - import
java.util.Date; -
- import
net.paoding.analysis.analyzer.PaodingAnalyzer; -
- import
org.apache.lucene.analysis.Analyzer; - import
org.apache.lucene.document.Document; - import
org.apache.lucene.document.Field; - import
org.apache.lucene.index.IndexWriter; -
-
-
- public
class CreateIndex { -
-
public void createIndex() throws Exception { -
-
File surceFileDir = new File(“D:\\save\\source”); -
-
-
File indexFileDir = new File(“D:\\save”); -
-
//Analyzer luceneAnalyzer = new StandardAnalyzer(); -
Analyzer luceneAnalyzer = new PaodingAnalyzer();//使用庖丁解牛分词法 -
-
IndexWriter indexWriter = new IndexWriter(indexFileDir, luceneAnalyzer, true);///参数isEmpty是false表示增量索引 -
File[] sourceFextFiles = surceFileDir.listFiles(); -
long startTime = new Date().getTime(); -
-
// 增加document到索引去 -
for (int i = 0; i < sourceFextFiles.length; i++) { -
if (sourceFextFiles[i].isFile() -
&& sourceFextFiles[i].getName().endsWith(“.txt”)) { -
System.out.println(“File “ + sourceFextFiles[i].getCanonicalPath() + “正在被索引….”); -
String temp = FileReaderAll(sourceFextFiles[i].getCanonicalPath(), “GBK”); -
System.out.println(temp); -
Document document = new Document(); -
Field FieldPath = new Field(“path”, sourceFextFiles[i].getPath(), Field.Store.YES, Field.Index.NO); -
Field FieldBody = new Field(“body”, temp, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS); -
Field FieldTitle = new Field(“title”, temp, Field.Store.YES, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS_OFFSETS); -
document.add(FieldPath); -
document.add(FieldBody);document.add(FieldTitle); -
indexWriter.addDocument(document); -
} -
} -
// optimize()方法是对索引进行优化 -
indexWriter.optimize(); -
indexWriter.close(); -
-
// 测试一下索引的时间 -
long endTime = new Date().getTime(); -
System.out.println(“这花费了” + (endTime - startTime) + ” 毫秒来把文档增加到索引里面去!” -
+ indexFileDir.getPath()); -
} -
-
public static String FileReaderAll(String FileName, String charset) -
throws IOException { -
BufferedReader reader = new BufferedReader(new InputStreamReader( -
new FileInputStream(FileName), charset)); -
String line = new String(); -
String temp = new String(); -
-
while ((line = reader.readLine()) != null) { -
temp += line; -
} -
reader.close(); -
return temp; -
} -
-
-
public static void main(String[] args) { -
try { -
new CreateIndex().createIndex(); -
} catch (Exception e) { -
e.printStackTrace(); -
} -
-
} -
- }
QueryHighLighter:检索关键字并高亮显示
- package
demo; -
- import
java.io.StringReader; -
- import
net.paoding.analysis.analyzer.PaodingAnalyzer; -
- import
org.apache.lucene.analysis.Analyzer; - import
org.apache.lucene.analysis.TokenStream; - import
org.apache.lucene.document.Document; - import
org.apache.lucene.queryParser.QueryParser; - import
org.apache.lucene.search.BooleanClause; - import
org.apache.lucene.search.IndexSearcher; - import
org.apache.lucene.search.Query; - import
org.apache.lucene.search.ScoreDoc; - import
org.apache.lucene.search.TopDocCollector; - import
org.apache.lucene.search.highlight.Highlighter; - import
org.apache.lucene.search.highlight.QueryScorer; - import
org.apache.lucene.search.highlight.SimpleFragmenter; - import
org.apache.lucene.search.highlight.SimpleHTMLFormatter; -
- import
test.TestLuceneHighlighter2; -
-
-
- public
class QueryHighLighter { -
-
private static final String FIELD_TITLE = “title”; -
-
private static final String FIELD_BODY = “body”; -
-
public synchronized Analyzer getAnalyzer() { -
return new PaodingAnalyzer();// 此处使用”庖丁解牛”分词法,另外一种是中科院分词法 -
} -
-
public String test(String queryString, int begin, int number) { -
StringBuffer sb = new StringBuffer(); -
IndexSearcher isearcher = null; -
try { -
isearcher = new IndexSearcher(“D:\\save”); -
-
BooleanClause.Occur[] clauses = { BooleanClause.Occur.SHOULD, -
BooleanClause.Occur.SHOULD }; -
TopDocCollector collector = new TopDocCollector(10); -
-
QueryParser queryParse = new QueryParser(FIELD_TITLE, getAnalyzer()); -
Query query = queryParse.parse(queryString); -
-
isearcher.search(query, collector); -
ScoreDoc[] hits = collector.topDocs().scoreDocs; -
// 用这个进行高亮显示,默认是<b>..</b> -
// 用这个指定<read>..</read> -
SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter(“<b><font color=’red’>”, “</font></b>”); -
// 构造高亮 -
// 指定高亮的格式 -
// 指定查询评分 -
Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query)); -
// 这个一般等于你要返回的,高亮的数据长度 -
// 如果太小,则只有数据的开始部分被解析并高亮,且返回的数据也少 -
// 太大,有时太浪费了。 -
highlighter.setTextFragmenter(new SimpleFragmenter(Integer.MAX_VALUE)); -
for (int i = begin; i < hits.length && i < begin + number; i++) { -
Document doc = isearcher.doc(hits[i].doc); -
String value = doc.get(FIELD_TITLE); -
String value2 = doc.get(FIELD_BODY); -
// 有三个参数 -
// 分析器 -
// 要解析的字段名 -
// 要解析的数据 -
//System.out.println(highlighter.getBestFragment(getAnalyzer(), -
// FIELD_TITLE, doc.get(FIELD_TITLE))); -
-
if (value != null) { -
TokenStream tokenStream = getAnalyzer().tokenStream(FIELD_TITLE, new StringReader(value)); -
String str = highlighter.getBestFragment(tokenStream, value); -
sb.append(“<li><li>”).append(str).append(“<br/>”); -
System.out.println(str); -
} -
-
} -
} catch (Exception e) { -
e.printStackTrace(); -
} finally { -
if (isearcher != null) { -
try { -
isearcher.close(); -
} catch (Exception e) { -
e.printStackTrace(); -
} -
} -
} -
return sb.toString(); -
} -
-
public static void main(String[] args){ -
TestLuceneHighlighter2 t = new TestLuceneHighlighter2(); -
String queryString = “中华人民共和国”; -
int begin = 0; -
int number = 10; -
t.test(queryString, begin, number); -
} -
- }
-
- package
demo; -
- import
java.io.StringReader; -
- import
net.paoding.analysis.analyzer.PaodingAnalyzer; -
- import
org.apache.lucene.analysis.Analyzer; - import
org.apache.lucene.analysis.TokenStream; - import
org.apache.lucene.document.Document; - import
org.apache.lucene.queryParser.QueryParser; - import
org.apache.lucene.search.BooleanClause; - import
org.apache.lucene.search.IndexSearcher; - import
org.apache.lucene.search.Query; - import
org.apache.lucene.search.ScoreDoc; - import
org.apache.lucene.search.TopDocCollector; - import
org.apache.lucene.search.highlight.Highlighter; - import
org.apache.lucene.search.highlight.QueryScorer; - import
org.apache.lucene.search.highlight.SimpleFragmenter; - import
org.apache.lucene.search.highlight.SimpleHTMLFormatter; -
- import
test.TestLuceneHighlighter2; -
-
-
- public
class QueryHighLighter { -
-
private static final String FIELD_TITLE = “title”; -
-
private static final String FIELD_BODY = “body”; -
-
public synchronized Analyzer getAnalyzer() { -
return new PaodingAnalyzer();// 此处使用”庖丁解牛”分词法,另外一种是中科院分词法 -
} -
-
public String test(String queryString, int begin, int number) { -
StringBuffer sb = new StringBuffer(); -
IndexSearcher isearcher = null; -
try { -
isearcher = new IndexSearcher(“D:\\save”); -
-
BooleanClause.Occur[] clauses = { BooleanClause.Occur.SHOULD, -
BooleanClause.Occur.SHOULD }; -
TopDocCollector collector = new TopDocCollector(10); -
-
QueryParser queryParse = new QueryParser(FIELD_TITLE, getAnalyzer()); -
Query query = queryParse.parse(queryString); -
-
isearcher.search(query, collector); -
ScoreDoc[] hits = collector.topDocs().scoreDocs; -
// 用这个进行高亮显示,默认是<b>..</b> -
// 用这个指定<read>..</read> -
SimpleHTMLFormatter simpleHTMLFormatter = new SimpleHTMLFormatter(“<b><font color=’red’>”, “</font></b>”); -
// 构造高亮 -
// 指定高亮的格式 -
// 指定查询评分 -
Highlighter highlighter = new Highlighter(simpleHTMLFormatter, new QueryScorer(query)); -
// 这个一般等于你要返回的,高亮的数据长度 -
// 如果太小,则只有数据的开始部分被解析并高亮,且返回的数据也少 -
// 太大,有时太浪费了。 -
highlighter.setTextFragmenter(new SimpleFragmenter(Integer.MAX_VALUE)); -
for (int i = begin; i < hits.length && i < begin + number; i++) { -
Document doc = isearcher.doc(hits[i].doc); -
String value = doc.get(FIELD_TITLE); -
String value2 = doc.get(FIELD_BODY); -
// 有三个参数 -
// 分析器 -
// 要解析的字段名 -
// 要解析的数据 -
//System.out.println(highlighter.getBestFragment(getAnalyzer(), -
// FIELD_TITLE, doc.get(FIELD_TITLE))); -
-
if (value != null) { -
TokenStream tokenStream = getAnalyzer().tokenStream(FIELD_TITLE, new StringReader(value)); -
String str = highlighter.getBestFragment(tokenStream, value); -
sb.append(“<li><li>”).append(str).append(“<br/>”); -
System.out.println(str); -
} -
-
} -
} catch (Exception e) { -
e.printStackTrace(); -
} finally { -
if (isearcher != null) { -
try { -
isearcher.close(); -
} catch (Exception e) { -
e.printStackTrace(); -
} -
} -
} -
return sb.toString(); -
} -
-
public static void main(String[] args){ -
TestLuceneHighlighter2 t = new TestLuceneHighlighter2(); -
String queryString = “中华人民共和国”; -
int begin = 0; -
int number = 10; -
t.test(queryString, begin, number); -
} -
- }
-
附加上传net\paoding\analysis\Constants.java便于理解参数设置:
- package
net.paoding.analysis; -
- import
java.util.HashMap; - import
java.util.Map; - import
java.util.Properties; -
-
/**
 * Property keys understood by the Paoding analyzer together with the default
 * value used for each key when a {@link Properties} object does not define it.
 *
 * <p>Every {@code XXX} key constant is paired with an {@code XXX_DEFAULT}
 * constant; the pairs are registered in an internal lookup table that
 * {@link #getProperty(Properties, String)} consults as a fallback.
 */
public class Constants {

    // -------------------------------------------------------------
    // Selects which dictionary-home setting is consulted first.
    // NOTE(review): the setup instructions that accompany this listing spell
    // this key "config-fisrt"; confirm which spelling your Paoding build reads.
    public static final String DIC_HOME_CONFIG_FIRST = "paoding.dic.home.config-first";
    public static final String DIC_HOME_CONFIG_FIRST_DEFAULT = "system-env";

    // Name of the environment variable that points at the dictionary directory.
    public static final String ENV_PAODING_DIC_HOME = "PAODING_DIC_HOME";

    // -------------------------------------------------------------
    // Dictionary home location; there is no built-in default.
    public static final String DIC_HOME = "paoding.dic.home";
    public static final String DIC_HOME_DEFAULT = null;

    // -------------------------------------------------------------
    // Character set of the dictionary files.
    public static final String DIC_CHARSET = "paoding.dic.charset";
    public static final String DIC_CHARSET_DEFAULT = "UTF-8";

    // -------------------------------------------------------------
    // Dictionaries which are skipped.
    public static final String DIC_SKIP_PREFIX = "paoding.dic.skip.prefix";
    public static final String DIC_SKIP_PREFIX_DEFAULT = "x-";

    // -------------------------------------------------------------
    // Chinese/CJK characters that will not be tokenized.
    public static final String DIC_NOISE_CHARACTOR = "paoding.dic.noise-charactor";
    public static final String DIC_NOISE_CHARACTOR_DEFAULT = "x-noise-charactor";

    // -------------------------------------------------------------
    // Chinese/CJK words that will not be tokenized.
    public static final String DIC_NOISE_WORD = "paoding.dic.noise-word";
    public static final String DIC_NOISE_WORD_DEFAULT = "x-noise-word";

    // -------------------------------------------------------------
    // Unit words, like "ge", "zhi", ...
    public static final String DIC_UNIT = "paoding.dic.unit";
    public static final String DIC_UNIT_DEFAULT = "x-unit";

    // -------------------------------------------------------------
    // Family names, like "Wang", "Zhang", ...
    public static final String DIC_CONFUCIAN_FAMILY_NAME = "paoding.dic.confucian-family-name";
    public static final String DIC_CONFUCIAN_FAMILY_NAME_DEFAULT = "x-confucian-family-name";

    // -------------------------------------------------------------
    // Dictionary used for combinatorics.
    public static final String DIC_FOR_COMBINATORICS = "paoding.dic.for-combinatorics";
    public static final String DIC_FOR_COMBINATORICS_DEFAULT = "x-for-combinatorics";

    // -------------------------------------------------------------
    // Dictionary detector interval.
    // NOTE(review): the unit of "60" is not stated here - presumably seconds; confirm.
    public static final String DIC_DETECTOR_INTERVAL = "paoding.dic.detector.interval";
    public static final String DIC_DETECTOR_INTERVAL_DEFAULT = "60";

    // -------------------------------------------------------------
    // Analyzer mode, like "default", "max", ...
    // The name ANALYZER_MOE_DEFAULT (sic) is public and therefore kept as is.
    public static final String ANALYZER_MODE = "paoding.analyzer.mode";
    public static final String ANALYZER_MOE_DEFAULT = "most-words";

    // -------------------------------------------------------------
    // Dictionaries compiler; there is no built-in default.
    public static final String ANALYZER_DICTIONARIES_COMPILER = "paoding.analyzer.dictionaries.compiler";
    public static final String ANALYZER_DICTIONARIES_COMPILER_DEFAULT = null;

    // -------------------------------------------------------------
    // Property key -> default value (String, possibly null). Deliberately a
    // raw Map to match the cast in getProperty.
    private static final Map map = new HashMap();

    static {
        map.put(DIC_HOME_CONFIG_FIRST, DIC_HOME_CONFIG_FIRST_DEFAULT);
        map.put(DIC_HOME, DIC_HOME_DEFAULT);
        map.put(DIC_CHARSET, DIC_CHARSET_DEFAULT);
        map.put(DIC_SKIP_PREFIX, DIC_SKIP_PREFIX_DEFAULT);
        map.put(DIC_NOISE_CHARACTOR, DIC_NOISE_CHARACTOR_DEFAULT);
        map.put(DIC_NOISE_WORD, DIC_NOISE_WORD_DEFAULT);
        map.put(DIC_UNIT, DIC_UNIT_DEFAULT);
        map.put(DIC_CONFUCIAN_FAMILY_NAME, DIC_CONFUCIAN_FAMILY_NAME_DEFAULT);
        map.put(DIC_FOR_COMBINATORICS, DIC_FOR_COMBINATORICS_DEFAULT);
        map.put(DIC_DETECTOR_INTERVAL, DIC_DETECTOR_INTERVAL_DEFAULT);
        map.put(ANALYZER_MODE, ANALYZER_MOE_DEFAULT);
        map.put(ANALYZER_DICTIONARIES_COMPILER, ANALYZER_DICTIONARIES_COMPILER_DEFAULT);
    }

    // Prefix of the property keys that name knife classes.
    public static final String KNIFE_CLASS = "paoding.knife.class.";

    /**
     * Looks up a property, falling back to the default registered for its key.
     *
     * @param p    properties to read from
     * @param name property key
     * @return the value defined in {@code p}; otherwise the registered default
     *         for {@code name}; {@code null} when neither exists
     */
    public static String getProperty(Properties p, String name) {
        return p.getProperty(name, (String) map.get(name));
    }
}
发表评论