|
|
@ -28,7 +28,9 @@
|
|
|
|
package net.yacy.document.parser;
|
|
|
|
package net.yacy.document.parser;
|
|
|
|
|
|
|
|
|
|
|
|
import java.io.InputStream;
|
|
|
|
import java.io.InputStream;
|
|
|
|
|
|
|
|
import java.util.ArrayList;
|
|
|
|
import java.util.Date;
|
|
|
|
import java.util.Date;
|
|
|
|
|
|
|
|
import java.util.List;
|
|
|
|
|
|
|
|
|
|
|
|
import net.yacy.cora.document.id.AnchorURL;
|
|
|
|
import net.yacy.cora.document.id.AnchorURL;
|
|
|
|
import net.yacy.cora.util.CommonPattern;
|
|
|
|
import net.yacy.cora.util.CommonPattern;
|
|
|
@ -80,7 +82,7 @@ public class docParser extends AbstractParser implements Parser {
|
|
|
|
throw new Parser.Failure("error in docParser, getText: " + e.getMessage(), location);
|
|
|
|
throw new Parser.Failure("error in docParser, getText: " + e.getMessage(), location);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
String title = (contents.length() > 240) ? contents.substring(0,240) : contents.toString().trim();
|
|
|
|
String title = (contents.length() > 240) ? contents.substring(0,240) : contents.toString().trim();
|
|
|
|
title.replaceAll("\r"," ").replaceAll("\n"," ").replaceAll("\t"," ").trim();
|
|
|
|
title = title.replaceAll("\r"," ").replaceAll("\n"," ").replaceAll("\t"," ").trim();
|
|
|
|
if (title.length() > 80) title = title.substring(0, 80);
|
|
|
|
if (title.length() > 80) title = title.substring(0, 80);
|
|
|
|
int l = title.length();
|
|
|
|
int l = title.length();
|
|
|
|
while (true) {
|
|
|
|
while (true) {
|
|
|
@ -97,6 +99,10 @@ public class docParser extends AbstractParser implements Parser {
|
|
|
|
keywlist = null;
|
|
|
|
keywlist = null;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
final String subject = extractor.getSummaryInformation().getSubject();
|
|
|
|
|
|
|
|
List<String> descriptions = new ArrayList<String>();
|
|
|
|
|
|
|
|
if (subject != null && !subject.isEmpty()) descriptions.add(subject);
|
|
|
|
|
|
|
|
|
|
|
|
Document[] docs;
|
|
|
|
Document[] docs;
|
|
|
|
docs = new Document[]{new Document(
|
|
|
|
docs = new Document[]{new Document(
|
|
|
|
location,
|
|
|
|
location,
|
|
|
@ -109,7 +115,7 @@ public class docParser extends AbstractParser implements Parser {
|
|
|
|
extractor.getSummaryInformation().getAuthor(), // constructor can handle null
|
|
|
|
extractor.getSummaryInformation().getAuthor(), // constructor can handle null
|
|
|
|
extractor.getDocSummaryInformation().getCompany(), // publisher
|
|
|
|
extractor.getDocSummaryInformation().getCompany(), // publisher
|
|
|
|
null,
|
|
|
|
null,
|
|
|
|
null,
|
|
|
|
descriptions,
|
|
|
|
0.0f, 0.0f,
|
|
|
|
0.0f, 0.0f,
|
|
|
|
contents.toString(),
|
|
|
|
contents.toString(),
|
|
|
|
null,
|
|
|
|
null,
|
|
|
|