*) now using the Apache POI library to parse Word documents

*) removed the tm-extractors library (it can be found at http://www.textmining.org/ if it is needed again)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6193 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
low012 16 years ago
parent caedd72400
commit f242e7d7bc

Binary file not shown.

@ -31,14 +31,13 @@ import java.io.IOException;
import java.io.InputStream; import java.io.InputStream;
import java.io.UnsupportedEncodingException; import java.io.UnsupportedEncodingException;
import java.util.HashMap; import java.util.HashMap;
import org.textmining.extraction.TextExtractor;
import org.textmining.extraction.word.WordTextExtractorFactory;
import de.anomic.document.AbstractParser; import de.anomic.document.AbstractParser;
import de.anomic.document.Idiom; import de.anomic.document.Idiom;
import de.anomic.document.ParserException; import de.anomic.document.ParserException;
import de.anomic.document.Document; import de.anomic.document.Document;
import de.anomic.yacy.yacyURL; import de.anomic.yacy.yacyURL;
import org.apache.poi.hwpf.extractor.WordExtractor;
public class docParser extends AbstractParser implements Idiom { public class docParser extends AbstractParser implements Idiom {
@ -65,20 +64,25 @@ public class docParser extends AbstractParser implements Idiom {
} }
public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException { public Document parse(final yacyURL location, final String mimeType, final String charset, final InputStream source) throws ParserException, InterruptedException {
final WordTextExtractorFactory extractorFactory = new WordTextExtractorFactory();
TextExtractor extractor = null; final WordExtractor extractor;
try { try {
extractor = extractorFactory.textExtractor(source); extractor = new WordExtractor(source);
} catch (Exception e) { } catch (IOException e) {
throw new ParserException("error in docParser, WordTextExtractorFactory: " + e.getMessage(), location); throw new ParserException("error in docParser, WordTextExtractorFactory: " + e.getMessage(), location);
} }
String contents = null;
StringBuilder contents = new StringBuilder();
try { try {
contents = extractor.getText().trim(); contents.append(extractor.getText().trim());
} catch (IOException e) { contents.append(extractor.getHeaderText());
contents.append(extractor.getFooterText());
} catch (Exception e) {
throw new ParserException("error in docParser, getText: " + e.getMessage(), location); throw new ParserException("error in docParser, getText: " + e.getMessage(), location);
} }
String title = contents.replaceAll("\r"," ").replaceAll("\n"," ").replaceAll("\t"," ").trim(); String title = (contents.length() > 240) ? contents.substring(0,240) : contents.toString().trim();
title.replaceAll("\r"," ").replaceAll("\n"," ").replaceAll("\t"," ").trim();
if (title.length() > 80) title = title.substring(0, 80); if (title.length() > 80) title = title.substring(0, 80);
int l = title.length(); int l = title.length();
while (true) { while (true) {
@ -86,6 +90,7 @@ public class docParser extends AbstractParser implements Idiom {
if (title.length() == l) break; if (title.length() == l) break;
l = title.length(); l = title.length();
} }
Document theDoc; Document theDoc;
try { try {
theDoc = new Document( theDoc = new Document(
@ -98,7 +103,7 @@ public class docParser extends AbstractParser implements Idiom {
"", // TODO: AUTHOR "", // TODO: AUTHOR
null, null,
null, null,
contents.getBytes("UTF-8"), contents.toString().getBytes("UTF-8"),
null, null,
null); null);
} catch (UnsupportedEncodingException e) { } catch (UnsupportedEncodingException e) {

Loading…
Cancel
Save