用 Tika 来解析 PDF、Word、Excel、TXT、超链接

<!-- .apache.tikatika-parsers --><dependency><gr

用 Tika 来解析 PDF、Word、Excel、TXT、超链接

	<!-- org.apache.tika/tika-parsers -->
	<dependency>
		<groupId>org.apache.tika</groupId>
		<artifactId>tika-parsers</artifactId>
		<version>1.1</version>
	</dependency>

这是 Tika 的解析包(Maven 依赖)。

下面是测试代码,都有效

package com.crsri.tika.tes;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import org.apache.tika.Tika;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.BodyContentHandler;
import org.junit.Test;
import org.xml.sax.SAXException;
import com.crsri.TgdsmApplicationTests;
/**
 * Sample tests showing how to extract text and metadata with Apache Tika
 * from a URL, Word/Excel/plain-text/PDF files, and a raw input stream.
 *
 * <p>The file paths are hard-coded Windows paths; adjust them to files that
 * exist on the machine running the tests.
 *
 * @author liufei
 */
public class TikaTest extends TgdsmApplicationTests {

    /**
     * Extracts the text content of a resource addressed by URL.
     *
     * @throws MalformedURLException if the URL string is not a valid URL
     * @throws IOException if the resource cannot be read
     * @throws TikaException if Tika fails to parse the content
     */
    @Test
    public void tikaTest1() throws MalformedURLException, IOException, TikaException {
        Tika tika = new Tika();
        // NOTE(review): the URL literal is empty, so new URL("") throws
        // MalformedURLException. The address was presumably stripped when the
        // snippet was published - supply a real URL before running.
        String parseToString = tika.parseToString(new URL(""));
        System.out.println(parseToString);
    }

    /**
     * Extracts the text content of a Word (.docx) document.
     *
     * @throws MalformedURLException never thrown here; kept for signature compatibility
     * @throws IOException if the file cannot be read
     * @throws TikaException if Tika fails to parse the content
     */
    @Test
    public void tikaTest2() throws MalformedURLException, IOException, TikaException {
        Tika tika = new Tika();
        File file = new File("D:\\caomao2.docx");
        String parseToString = tika.parseToString(file);
        System.out.println(parseToString);
    }

    /**
     * Extracts the text content of an Excel (.xlsx) workbook.
     *
     * @throws MalformedURLException never thrown here; kept for signature compatibility
     * @throws IOException if the file cannot be read
     * @throws TikaException if Tika fails to parse the content
     */
    @Test
    public void tikaTest3() throws MalformedURLException, IOException, TikaException {
        Tika tika = new Tika();
        File file = new File("D:\\工作簿1.xlsx");
        String parseToString = tika.parseToString(file);
        System.out.println(parseToString);
    }

    /**
     * Extracts the text content of a plain-text file.
     *
     * @throws MalformedURLException never thrown here; kept for signature compatibility
     * @throws IOException if the file cannot be read
     * @throws TikaException if Tika fails to parse the content
     */
    @Test
    public void tikaTest4() throws MalformedURLException, IOException, TikaException {
        Tika tika = new Tika();
        File file = new File("D:\\base64.txt");
        String parseToString = tika.parseToString(file);
        System.out.println(parseToString);
    }

    /**
     * Extracts the text content of a PDF document.
     *
     * @throws MalformedURLException never thrown here; kept for signature compatibility
     * @throws IOException if the file cannot be read
     * @throws TikaException if Tika fails to parse the content
     */
    @Test
    public void tikaTest5() throws MalformedURLException, IOException, TikaException {
        Tika tika = new Tika();
        File file = new File("F:\\猫眼\\UML_Reference_Manual.pdf");
        String parseToString = tika.parseToString(file);
        System.out.println(parseToString);
    }

    /**
     * Uses the lower-level parser API to read a document's title, type and
     * body text separately.
     *
     * @throws IOException if the file cannot be read or closed
     * @throws SAXException if the content handler fails while receiving events
     * @throws TikaException if Tika fails to parse the content
     */
    @Test
    public void test10() throws IOException, SAXException, TikaException {
        // Collects the body text emitted by the parser.
        BodyContentHandler textHandler = new BodyContentHandler();
        // Receives document metadata such as author and title.
        Metadata matadata = new Metadata();
        // AutoDetectParser detects the document type itself, so the same code
        // works for Word, PDF, HTML, etc. without choosing a parser by hand.
        AutoDetectParser parser = new AutoDetectParser();
        ParseContext context = new ParseContext();

        // try-with-resources closes the stream even when parse() throws;
        // the original only closed it on the success path.
        try (FileInputStream input = new FileInputStream(new File("D:\\窗前明月光.docx"))) {
            parser.parse(input, textHandler, matadata, context);
        }

        System.out.println("Title: " + matadata.get(Metadata.TITLE));
        System.out.println("Type: " + matadata.get(Metadata.TYPE));
        // The handler's string form is the extracted body text.
        System.out.println("Body: " + textHandler.toString());
    }

    /**
     * Extracts text from a document supplied as an input stream rather than
     * a {@link File}.
     *
     * @throws IOException if the file cannot be read or closed
     * @throws SAXException never thrown here; kept for signature compatibility
     * @throws TikaException if Tika fails to parse the content
     */
    @Test
    public void test11() throws IOException, SAXException, TikaException {
        Tika tika = new Tika();
        // Scope the stream explicitly so it is released on every path.
        try (FileInputStream input = new FileInputStream(new File("D:\\窗前明月光.docx"))) {
            String parseToString = tika.parseToString(input);
            // Print the result like the other tests; the original discarded it.
            System.out.println(parseToString);
        }
    }

}

发布者:admin,转载请注明出处:http://www.yc00.com/web/1691158092a508978.html

相关推荐

发表回复

评论列表(0条)

  • 暂无评论

联系我们

400-800-8888

在线咨询: QQ交谈

邮件:admin@example.com

工作时间:周一至周五,9:30-18:30,节假日休息

关注微信