I'm still a newbie at this. I've recently been studying Lucene, along with the crawler I wrote earlier, and it occurred to me that I could try using Lucene to build a site-internal search. I don't know HTMLParser very well and my string handling is a bit shaky, but the end result works. The LuceneSpider class is below, followed by a small Run class that drives it.
package LuceneSpider;
import java.io.IOException;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.Set;

import org.apache.commons.httpclient.DefaultHttpMethodRetryHandler;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.params.HttpMethodParams;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Index;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriter.MaxFieldLength;
import org.apache.lucene.queryParser.MultiFieldQueryParser;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.Filter;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.LockObtainFailedException;
import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.filters.OrFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeIterator;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;

public class LuceneSpider {

    // URLs that have already been fetched and indexed
    private Set<String> visitedUrlSet = new HashSet<String>();
    // The crawl frontier: URLs discovered but not yet fetched
    private LinkedList<String> unvisitedUrlSet = new LinkedList<String>();

    String[] seeds;    // seed URLs that initialize the frontier
    String line;       // URL prefix the crawler must stay within
    String savepath;   // directory the Lucene index is written to
    String encoding;   // response charset of the page currently being processed
    int savenum;       // maximum number of pages to fetch
    Analyzer analyzer;

    public LuceneSpider(String[] seeds, String line, String savepath, int savenum, Analyzer analyzer) {
        this.seeds = seeds;
        this.line = line;
        this.savepath = savepath;
        this.savenum = savenum;
        this.analyzer = analyzer;
    }

    /**
     * Initialize the URL queue with the seeds.
     */
    public void init() {
        Set<String> seedsSet = new HashSet<String>();
        for (int i = 0; i < seeds.length; i++) {
            seedsSet.add(seeds[i]);
        }
        addToUnvisitedUrlSet(seedsSet);
    }

    public void run() throws ParserException, HttpException, IOException {
        init();
        for (int i = 0; i < savenum && !IsUnvisitedUrlSetEmpty(); i++) {
            String url = getFirstFromUnvisitedUrlSet();
            catchPages(url);
        }
    }

    public void catchPages(String url) throws ParserException, HttpException, IOException {
        HttpClient httpClient = new HttpClient();
        httpClient.getHttpConnectionManager().getParams().setConnectionTimeout(5000);
        // Create the GET method with a 5 s read timeout and the default retry handler
        GetMethod getMethod = new GetMethod(url);
        getMethod.getParams().setParameter(HttpMethodParams.SO_TIMEOUT, 5000);
        getMethod.getParams().setParameter(HttpMethodParams.RETRY_HANDLER,
                new DefaultHttpMethodRetryHandler());
        try {
            // Execute the HTTP GET request
            int statusCode = httpClient.executeMethod(getMethod);
            if (statusCode != HttpStatus.SC_OK) {
                System.err.println("Method failed: " + url + " " + getMethod.getStatusLine());
            } else {
                encoding = getMethod.getResponseCharSet();
                createIndex(url);
                addToVisitedUrlSet(url);
                addToUnvisitedUrlSet(getUrls(url));
                System.out.println(unvisitedUrlSet.size());
            }
        } finally {
            getMethod.releaseConnection();
        }
    }

    private void createIndex(String url)
            throws CorruptIndexException, LockObtainFailedException, IOException, ParserException {
        String content = getContentByUrl(url);
        Document doc = new Document();
        // URL of the page (stored, not analyzed)
        doc.add(new Field("url", url, Store.YES, Index.NOT_ANALYZED));
        // Tag-stripped page content (stored and analyzed)
        doc.add(new Field("content", content, Store.YES, Index.ANALYZED));
        System.out.println(url);
        // Create the index on the first document, append to it afterwards
        boolean create = !IndexReader.indexExists(savepath);
        IndexWriter indexWriter = new IndexWriter(savepath, analyzer, create, MaxFieldLength.LIMITED);
        indexWriter.addDocument(doc);
        indexWriter.close();
    }

    /*
     * Fetch the page at the given URL and return its text with all tags stripped.
     */
    private String getContentByUrl(String url) throws ParserException {
        String content = "";
        Parser parser = new Parser(url);
        for (NodeIterator iterator = parser.elements(); iterator.hasMoreNodes();) {
            Node node = iterator.nextNode();
            content = content + node.toPlainTextString().replaceAll(" ", "").replaceAll("\n", "");
        }
        return content;
    }

    /*
     * Extract the links from the page.
     */
    public Set<String> getUrls(String url) throws ParserException {
        Set<String> links = new HashSet<String>();
        Parser parser = new Parser(url);
        if (encoding != null) {
            // Reuse the charset detected by HttpClient when we have one
            parser.setEncoding(encoding);
        }
        // <frame src="..."> tags are not LinkTags, so match them separately
        NodeFilter frameFilter = new NodeFilter() {
            @Override
            public boolean accept(Node node) {
                return node.getText().startsWith("frame src=");
            }
        };
        OrFilter linkFilter = new OrFilter(new NodeClassFilter(LinkTag.class), frameFilter);
        NodeList list = parser.extractAllNodesThatMatch(linkFilter);
        for (int i = 0; i < list.size(); i++) {
            Node tag = list.elementAt(i);
            if (tag instanceof LinkTag) {
                // <a> tag
                String linkUrl = ((LinkTag) tag).getLink();
                if (LinkFilter(linkUrl)) {
                    links.add(linkUrl);
                }
            } else if (frameFilter.accept(tag)) {
                // <frame> tag: cut the URL out of the src attribute
                String frameTxt = tag.getText();
                int start = frameTxt.indexOf("src=");
                frameTxt = frameTxt.substring(start);
                int end = frameTxt.indexOf(" ");
                if (end == -1) {
                    end = frameTxt.indexOf(">");
                }
                String frameUrl = frameTxt.substring(5, end - 1);
                if (LinkFilter(frameUrl)) {
                    links.add(frameUrl);
                }
            }
        }
        return links;
    }

    // The "line" the crawler follows: only URLs under the given prefix are kept
    public boolean LinkFilter(String url) {
        return url.startsWith(line);
    }

    // Turn a URL into a legal file name (otherwise saving to disk fails);
    // kept from the file-saving crawler, not used when indexing with Lucene
    public String getFileNameByUrl(String url, String contentType) {
        // strip the leading "http://"
        url = url.substring(7);
        if (contentType.indexOf("html") != -1) {
            // text/html pages
            return url.replaceAll("[\\?/:*|<>\"]", "_") + ".html";
        }
        return url.replaceAll("[\\?/:*|<>\"]", "_") + "."
                + contentType.substring(contentType.lastIndexOf("/") + 1);
    }

    public void addToVisitedUrlSet(String url) {
        visitedUrlSet.add(url);
    }

    public boolean IsUnvisitedUrlSetEmpty() {
        return unvisitedUrlSet.isEmpty();
    }

    public void addToUnvisitedUrlSet(Set<String> urls) {
        for (String url : urls) {
            // skip URLs already crawled or already queued, so no page is indexed twice
            if (!isVisited(url) && !unvisitedUrlSet.contains(url)) {
                unvisitedUrlSet.add(url);
            }
        }
    }

    public boolean isVisited(String url) {
        return visitedUrlSet.contains(url);
    }

    public String getFirstFromUnvisitedUrlSet() {
        return unvisitedUrlSet.removeFirst();
    }

    public void search(String about) throws Exception {
        // 1. Parse the query text into a Query over both fields
        String[] fields = { "url", "content" };
        QueryParser queryParser = new MultiFieldQueryParser(fields, analyzer);
        Query query = queryParser.parse(about);
        // 2. Run the query against the index
        IndexSearcher indexSearcher = new IndexSearcher(savepath);
        Filter filter = null;
        TopDocs topDocs = indexSearcher.search(query, filter, 10000);
        System.out.println("Total of " + topDocs.totalHits + " matching results");
        // 3. Print the results
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            // internal document number
            int index = scoreDoc.doc;
            // fetch the document by its number
            Document doc = indexSearcher.doc(index);
            System.out.println("------------------------------");
            System.out.println("url = " + doc.get("url"));
            // System.out.println("content = " + doc.get("content"));
        }
        indexSearcher.close();
    }
}
The Run class supplies the seeds, the crawl prefix, the index directory, and the analyzer, then starts the crawl and runs a test search:

package LuceneSpider;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;

public class Run {

    public static void main(String[] args) {
        String[] seeds = { "http://localhost/openzone" };  // start the crawl here
        String line = "http://localhost";                  // only follow URLs under this prefix
        String savepath = "D:\\javaworkspace\\openzone";   // where the Lucene index is written
        int savenum = 100;                                 // fetch at most 100 pages
        Analyzer analyzer = new StandardAnalyzer();
        LuceneSpider luceneSpider = new LuceneSpider(seeds, line, savepath, savenum, analyzer);
        try {
            luceneSpider.run();
            luceneSpider.search("合作站点");
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
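One caveat about the analyzer: Lucene's StandardAnalyzer breaks Chinese text into single-character tokens, so a query such as 合作站点 is searched as a run of one-character terms. If the lucene-analyzers contrib jar is on the classpath (an assumption, not something the code above requires), the bigram-based CJKAnalyzer is a one-line swap in Run.main(); the index then has to be rebuilt with the same analyzer that is used at search time:

    import org.apache.lucene.analysis.cjk.CJKAnalyzer;

    // Assumes the lucene-analyzers contrib jar. CJKAnalyzer indexes Chinese as
    // overlapping two-character tokens, which usually matches better than
    // the single characters StandardAnalyzer produces.
    Analyzer analyzer = new CJKAnalyzer();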