view plain 
    
import java.io.IOException;  
import java.util.ArrayList;  
import java.util.Collection;  
  
import org.apache.solr.client.solrj.SolrQuery;  
import org.apache.solr.client.solrj.SolrServer;  
import org.apache.solr.client.solrj.SolrServerException;  
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;  
import org.apache.solr.client.solrj.request.AbstractUpdateRequest;  
import org.apache.solr.client.solrj.request.UpdateRequest;  
import org.apache.solr.client.solrj.response.QueryResponse;  
import org.apache.solr.common.SolrInputDocument;  
  
public class SolrjTest {  
  
    public static void main(String[] args) throws IOException,  
            SolrServerException {  
  
        String urlString = " http://localhost:8080/solr";  
        SolrServer server = new CommonsHttpSolrServer(urlString);  
  
        SolrInputDocument doc1 = new SolrInputDocument();  
        doc1.addField("id", 12);  
        doc1.addField("content", "my test is easy,测试solr");  
        SolrInputDocument doc2 = new SolrInputDocument();  
        doc2.addField("id", "solrj简单测试");  
        doc2.addField("content", "doc2");  
        Collection<SolrInputDocument> docs = new ArrayList<SolrInputDocument>();  
        docs.add(doc1);  
        docs.add( doc2 );  
        server.add(docs);  
        UpdateRequest req = new UpdateRequest();  
        req.setAction(AbstractUpdateRequest.ACTION.COMMIT, false, false);  
        req.add(docs);  
        req.process(server);  
  
        SolrQuery query = new SolrQuery();  
  
        query.setQuery("test");  
        query.setHighlight(true).setHighlightSnippets(1);                                                     
        query.setParam("hl.fl", "content");  
  
        QueryResponse ret = server.query(query);  
  
        System.out.println(ret);  
    }  
}  

solrj要成功运行，需要导入下列包才行

From /dist：

apache-solr-solrj-3.1.0.jar

From /dist/solrj-lib：
commons-codec-1.4.jar
commons-httpclient-3.1.jar
jcl-over-slf4j-1.5.5.jar
slf4j-api-1.5.5.jar

下面这个包需要到官方网站单独下载：本人在 solr 3.1 的发行包中没有找到这个 jar 包，估计只有低版本中才自带
slf4j-jdk14-1.5.5.jar

solr从1.4版本开始，将apache Tika合并进来。Tika是一个内容抽取的工具集合(a toolkit for text extracting)，它集成了POI、Pdfbox，并且为文本抽取工作提供了一个统一的接口。在solr中利用这个工具，可以很简单地实现对pdf、word等富文本的提取。

我的是3.1版，在实现过程中，走了很多弯路，终于还是自己解决了，下面分享一下

 
     view plain 
    
package test;  
  
import java.io.File;  
import java.io.IOException;  
import org.apache.solr.client.solrj.SolrServer;  
import org.apache.solr.client.solrj.SolrServerException;  
  
import org.apache.solr.client.solrj.request.AbstractUpdateRequest;  
import org.apache.solr.client.solrj.response.QueryResponse;  
import org.apache.solr.client.solrj.SolrQuery;  
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;  
import org.apache.solr.client.solrj.request.ContentStreamUpdateRequest;  
  
/** 
 * @author aidy 2011.6.9 
 */  
public class SolrExampleTests {  
  
  public static void main(String[] args) {  
    try {  
      //Solr cell can also index MS file (2003 version and 2007 version) types.  
      String fileName = "D://test//luceneTest//1.pdf";   
      //this will be unique Id used by Solr to index the file contents.  
      String solrId = "1.pdf";   
        
      indexFilesSolrCell(fileName, solrId);  
        
    } catch (Exception ex) {  
      System.out.println(ex.toString());  
    }  
  }  
    
  /** 
   * Method to index all types of files into Solr.  
   * @param fileName 
   * @param solrId 
   * @throws IOException 
   * @throws SolrServerException 
   */  
  public static void indexFilesSolrCell(String fileName, String solrId)   
    throws IOException, SolrServerException {  
      
    String urlString = "http://localhost:8080/solr";   
    SolrServer solr = new CommonsHttpSolrServer(urlString);  
      
    ContentStreamUpdateRequest up   
      = new ContentStreamUpdateRequest("/update/extract");  
      
    up.addFile(new File(fileName));  
      
    up.setParam("literal.id", solrId);  
    up.setParam("fmap.content", "attr_content");  
      
    up.setAction(AbstractUpdateRequest.ACTION.COMMIT, true, true);  
      
    solr.request(up);  
      
    QueryResponse rsp = solr.query(new SolrQuery("*:*"));  
      
    System.out.println(rsp);  
  }  
}  

起初程序一直在solr.request(up)这一步报错，tomcat的报错信息是说没有ignored_meta类型。我一开始很不理解，因为我的配置文件schema.xml中根本没有这种类型，还以为是版本原因导致的，于是专门去下载了solr1.4版，运行果然不报错。后来才想到，是因为在前面的入门例子中我修改过配置文件schema.xml，而solrconfig.xml配置文件在/update/extract节点处有对ignored_类型的引用。我在schema.xml中加入ignored_类型后，运行正常。

后面研究一下如何用solrj进行查询，并将查询结果展示在web页面上，因为查询结果返回的是xml形式

如果solr是1.3版本或以下，请参考：http://wiki.apache.org/solr/UpdateRichDocuments

参考资料：

1.http://wiki.apache.org/solr/ExtractingRequestHandler
2.http://www.lucidimagination.com/Community/Hear-from-the-Experts/Articles/Content-Extraction-Tika

转自：http://www.cnblogs.com/ibook360/archive/2011/11/01/2231477

知识点

相关文章

最近更新

Apache Solr索引富文本（html word pdf）

相关问答

java使用openoffice将word转换为pdf的问题[2022-08-19]

在C＃客户端的Solr中索引pdf文档(Index pdf documents in Solr from C# client)[2022-02-13]

是否有可能通过页面提取使用Apache Tika的word / pdf文件的文本？(Is it possible to extract text by page for word/pdf files using Apache Tika?)[2022-01-08]

如何使用Apache POI从PDF中提取原始文本？(How can I extract raw text from PDFs using Apache POI?)[2023-04-09]

将Apache TIKA和Solr Cell与Solr集成以索引pdf和word文档(Integrate Apache TIKA and Solr Cell with Solr to index pdf and word documents)[2023-11-09]

Zend PDF包装(Zend PDF word wrapping)[2021-10-19]

用于将MS Word文件与字段和启动宏转换为PDF的Java API(Java API for converting MS Word file with fields, and start-up macro to PDF)[2021-09-04]

从Apache Solr中提取PDF(Extracting PDF from Apache Solr)[2023-10-02]

如何将pdf / word /文本文件发送到服务器(How to send a pdf/word/text file to server)[2023-11-03]

索引PDF - 使用Apache Solr和Apache Tika进行分面搜索(Indexing PDF - Faceted Search with Apache Solr and Apache Tika)[2023-04-14]