bugfixes in html parser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7912 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent b00e69c5df
commit 1c007188ad

@ -45,8 +45,11 @@ keywords
## character encoding, string ## character encoding, string
charset_s charset_s
## tags of css entries, normalized with absolute URL, textgen
attr_css_tag
## urls of css entries, normalized with absolute URL, textgen ## urls of css entries, normalized with absolute URL, textgen
attr_css attr_css_url
## number of css entries, int ## number of css entries, int
csscount_i csscount_i
@ -74,11 +77,24 @@ wordcount_i
## internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen ## internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
attr_inboundlinks_tag attr_inboundlinks_tag
attr_inboundlinks_protocol
attr_inboundlinks_urlstub ## internal links, only the protocol
attr_inboundlinks_name #attr_inboundlinks_protocol
attr_inboundlinks_rel
attr_inboundlinks_text ## internal links, the url only without the protocol
#attr_inboundlinks_urlstub
## internal links, the name property of the a-tag
#attr_inboundlinks_name
## internal links, the rel property of the a-tag
#attr_inboundlinks_rel
## internal links, the rel property of the a-tag, coded binary
#attr_inboundlinks_relcode
## internal links, the text content of the a-tag
#attr_inboundlinks_text
## total number of inbound links, int ## total number of inbound links, int
inboundlinkscount_i inboundlinkscount_i
@ -88,18 +104,43 @@ inboundlinksnoindexcount_i
## external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen ## external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
attr_outboundlinks_tag attr_outboundlinks_tag
attr_outboundlinks_protocol
attr_outboundlinks_urlstub
attr_outboundlinks_name
attr_outboundlinks_rel
attr_outboundlinks_text
## total number of external links, int ## external links, only the protocol
outboundlinkscount_i #attr_outboundlinks_protocol
## external links, the url only without the protocol
#attr_outboundlinks_urlstub
## external links, the name property of the a-tag
#attr_outboundlinks_name
## external links, the rel property of the a-tag
#attr_outboundlinks_rel
## external links, the text content of the a-tag
#attr_outboundlinks_text
## total number of external links, int
outboundlinks_i
## number of external links with noindex tag, int ## number of external links with noindex tag, int
outboundlinksnoindexcount_i outboundlinksnoindexcount_i
## all image tags, encoded as <img> tag inclusive alt- and title property, textgen
attr_images_tag
## all image links without the protocol and '://'
#attr_images_urlstub
## all image link protocols
#attr_images_protocol
## all image link alt tag
#attr_images_alt
## number of images, int
imagescount_i
## h1 header, textgen ## h1 header, textgen
attr_h1 attr_h1
@ -154,12 +195,6 @@ attr_italiccount
## total number of occurrences of <i>, int ## total number of occurrences of <i>, int
italic_i italic_i
## all image tags, encoded as <img> tag inclusive alt- and title property, textgen
attr_images
## number of images, int
imagescount_i
## flag that shows if a swf file is linked, boolean ## flag that shows if a swf file is linked, boolean
flash_b flash_b
@ -205,6 +240,12 @@ attr_tracker
## number of attribute counts in attr_tracker, textgen ## number of attribute counts in attr_tracker, textgen
attr_trackercount attr_trackercount
## names matching title expressions, textgen
attr_title
## number of matching title expressions, textgen
attr_titlecount
## fail reason if a page was not loaded. if the page was loaded then this field is empty, text ## fail reason if a page was not loaded. if the page was loaded then this field is empty, text
failreason_t failreason_t

@ -128,76 +128,75 @@ public class SolrScheme extends ConfigurationSet {
int c = 0; int c = 0;
if (isEmpty() || contains("inboundlinkscount_i")) addSolr(solrdoc, "inboundlinkscount_i", yacydoc.inboundLinkCount()); if (isEmpty() || contains("inboundlinkscount_i")) addSolr(solrdoc, "inboundlinkscount_i", yacydoc.inboundLinkCount());
if (isEmpty() || contains("inboundlinksnoindexcount_i")) addSolr(solrdoc, "inboundlinksnoindexcount_i", yacydoc.inboundLinkNoindexCount()); if (isEmpty() || contains("inboundlinksnoindexcount_i")) addSolr(solrdoc, "inboundlinksnoindexcount_i", yacydoc.inboundLinkNoindexCount());
if (isEmpty() || contains("attr_inboundlinks")) { final String[] inboundlinksTag = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksTag = new String[yacydoc.inboundLinkCount()]; final String[] inboundlinksURLProtocol = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksURLProtocol = new String[yacydoc.inboundLinkCount()]; final String[] inboundlinksURLStub = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksURLStub = new String[yacydoc.inboundLinkCount()]; final String[] inboundlinksName = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksName = new String[yacydoc.inboundLinkCount()]; final String[] inboundlinksRel = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksRel = new String[yacydoc.inboundLinkCount()]; final String[] inboundlinksText = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksText = new String[yacydoc.inboundLinkCount()]; for (final MultiProtocolURI url: yacydoc.inboundLinks()) {
for (final MultiProtocolURI url: yacydoc.inboundLinks()) { final Properties p = alllinks.get(url);
final Properties p = alllinks.get(url); final String name = p.getProperty("name", ""); // the name attribute
final String name = p.getProperty("name", ""); // the name attribute final String rel = p.getProperty("rel", ""); // the rel-attribute
final String rel = p.getProperty("rel", ""); // the rel-attribute final String text = p.getProperty("text", ""); // the text between the <a></a> tag
final String text = p.getProperty("text", ""); // the text between the <a></a> tag final String urls = url.toNormalform(false, false);
final String urls = url.toNormalform(false, false); final int pr = urls.indexOf("://");
final int pr = urls.indexOf("://"); inboundlinksURLProtocol[c] = urls.substring(0, pr);
inboundlinksURLProtocol[c] = urls.substring(0, pr); inboundlinksURLStub[c] = urls.substring(pr + 3);
inboundlinksURLStub[c] = urls.substring(pr + 3); inboundlinksName[c] = name.length() > 0 ? name : "";
inboundlinksName[c] = name.length() > 0 ? name : ""; inboundlinksRel[c] = rel.length() > 0 ? rel : "";
inboundlinksRel[c] = rel.length() > 0 ? rel : ""; inboundlinksText[c] = text.length() > 0 ? text : "";
inboundlinksText[c] = text.length() > 0 ? rel : ""; inboundlinksTag[c] =
inboundlinksTag[c] = "<a href=\"" + url.toNormalform(false, false) + "\"" +
"<a href=\"" + url.toNormalform(false, false) + "\"" + (rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
(rel.length() > 0 ? " rel=\"" + rel + "\"" : "") + (name.length() > 0 ? " name=\"" + name + "\"" : "") +
">" + ">" +
((name.length() > 0) ? name : "") + "</a>"; ((text.length() > 0) ? text : "") + "</a>";
c++; c++;
}
addSolr(solrdoc, "attr_inboundlinks_tag", inboundlinksTag);
addSolr(solrdoc, "attr_inboundlinks_protocol", inboundlinksURLProtocol);
addSolr(solrdoc, "attr_inboundlinks_urlstub", inboundlinksURLStub);
addSolr(solrdoc, "attr_inboundlinks_name", inboundlinksName);
addSolr(solrdoc, "attr_inboundlinks_rel", inboundlinksRel);
addSolr(solrdoc, "attr_inboundlinks_text", inboundlinksText);
} }
if (isEmpty() || contains("attr_inboundlinks_tag")) addSolr(solrdoc, "attr_inboundlinks_tag", inboundlinksTag);
if (isEmpty() || contains("attr_inboundlinks_protocol")) addSolr(solrdoc, "attr_inboundlinks_protocol", inboundlinksURLProtocol);
if (isEmpty() || contains("attr_inboundlinks_urlstub")) addSolr(solrdoc, "attr_inboundlinks_urlstub", inboundlinksURLStub);
if (isEmpty() || contains("attr_inboundlinks_name")) addSolr(solrdoc, "attr_inboundlinks_name", inboundlinksName);
if (isEmpty() || contains("attr_inboundlinks_rel")) addSolr(solrdoc, "attr_inboundlinks_rel", inboundlinksRel);
if (isEmpty() || contains("attr_inboundlinks_text")) addSolr(solrdoc, "attr_inboundlinks_text", inboundlinksText);
c = 0; c = 0;
if (isEmpty() || contains("outboundlinkscount_i")) addSolr(solrdoc, "outboundlinkscount_i", yacydoc.outboundLinkCount()); if (isEmpty() || contains("outboundlinkscount_i")) addSolr(solrdoc, "outboundlinkscount_i", yacydoc.outboundLinkCount());
if (isEmpty() || contains("outboundlinksnoindexcount_i")) addSolr(solrdoc, "outboundlinksnoindexcount_i", yacydoc.outboundLinkNoindexCount()); if (isEmpty() || contains("outboundlinksnoindexcount_i")) addSolr(solrdoc, "outboundlinksnoindexcount_i", yacydoc.outboundLinkNoindexCount());
if (isEmpty() || contains("attr_outboundlinks")) { final String[] outboundlinksTag = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksTag = new String[yacydoc.outboundLinkCount()]; final String[] outboundlinksURLProtocol = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksURLProtocol = new String[yacydoc.outboundLinkCount()]; final String[] outboundlinksURLStub = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksURLStub = new String[yacydoc.outboundLinkCount()]; final String[] outboundlinksName = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksName = new String[yacydoc.outboundLinkCount()]; final String[] outboundlinksRel = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksRel = new String[yacydoc.outboundLinkCount()]; final String[] outboundlinksText = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksText = new String[yacydoc.outboundLinkCount()]; for (final MultiProtocolURI url: yacydoc.outboundLinks()) {
for (final MultiProtocolURI url: yacydoc.outboundLinks()) { final Properties p = alllinks.get(url);
final Properties p = alllinks.get(url); final String name = p.getProperty("name", ""); // the name attribute
final String name = p.getProperty("name", ""); // the name attribute final String rel = p.getProperty("rel", ""); // the rel-attribute
final String rel = p.getProperty("rel", ""); // the rel-attribute final String text = p.getProperty("text", ""); // the text between the <a></a> tag
final String text = p.getProperty("text", ""); // the text between the <a></a> tag final String urls = url.toNormalform(false, false);
final String urls = url.toNormalform(false, false); final int pr = urls.indexOf("://");
final int pr = urls.indexOf("://"); outboundlinksURLProtocol[c] = urls.substring(0, pr);
outboundlinksURLProtocol[c] = urls.substring(0, pr); outboundlinksURLStub[c] = urls.substring(pr + 3);
outboundlinksURLStub[c] = urls.substring(pr + 3); outboundlinksName[c] = name.length() > 0 ? name : "";
outboundlinksName[c] = name.length() > 0 ? name : ""; outboundlinksRel[c] = rel.length() > 0 ? rel : "";
outboundlinksRel[c] = rel.length() > 0 ? rel : ""; outboundlinksText[c] = text.length() > 0 ? text : "";
outboundlinksText[c] = text.length() > 0 ? rel : ""; outboundlinksTag[c] =
outboundlinksTag[c] = "<a href=\"" + url.toNormalform(false, false) + "\"" +
"<a href=\"" + url.toNormalform(false, false) + "\"" + (rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
(rel.length() > 0 ? " rel=\"" + rel + "\"" : "") + (name.length() > 0 ? " name=\"" + name + "\"" : "") +
">" + ">" +
((name.length() > 0) ? name : "") + "</a>"; ((text.length() > 0) ? text : "") + "</a>";
c++; c++;
}
addSolr(solrdoc, "attr_outboundlinks_tag", outboundlinksTag);
addSolr(solrdoc, "attr_outboundlinks_protocol", outboundlinksURLProtocol);
addSolr(solrdoc, "attr_outboundlinks_urlstub", outboundlinksURLStub);
addSolr(solrdoc, "attr_outboundlinks_name", outboundlinksName);
addSolr(solrdoc, "attr_outboundlinks_rel", outboundlinksRel);
addSolr(solrdoc, "attr_outboundlinks_text", outboundlinksText);
} }
if (isEmpty() || contains("attr_outboundlinks_tag")) addSolr(solrdoc, "attr_outboundlinks_tag", outboundlinksTag);
if (isEmpty() || contains("attr_outboundlinks_protocol")) addSolr(solrdoc, "attr_outboundlinks_protocol", outboundlinksURLProtocol);
if (isEmpty() || contains("attr_outboundlinks_urlstub")) addSolr(solrdoc, "attr_outboundlinks_urlstub", outboundlinksURLStub);
if (isEmpty() || contains("attr_outboundlinks_name")) addSolr(solrdoc, "attr_outboundlinks_name", outboundlinksName);
if (isEmpty() || contains("attr_outboundlinks_rel")) addSolr(solrdoc, "attr_outboundlinks_rel", outboundlinksRel);
if (isEmpty() || contains("attr_outboundlinks_text")) addSolr(solrdoc, "attr_outboundlinks_text", outboundlinksText);
// charset // charset
addSolr(solrdoc, "charset_s", yacydoc.getCharset()); addSolr(solrdoc, "charset_s", yacydoc.getCharset());
@ -255,27 +254,43 @@ public class SolrScheme extends ConfigurationSet {
if (li.length > 0) addSolr(solrdoc, "attr_li", li); if (li.length > 0) addSolr(solrdoc, "attr_li", li);
// images // images
if (isEmpty() || contains("attr_images")) { final Collection<ImageEntry> imagesc = html.getImages().values();
final Collection<ImageEntry> imagesc = html.getImages().values(); final String[] imgtags = new String[imagesc.size()];
final String[] images = new String[imagesc.size()]; final String[] imgprots = new String[imagesc.size()];
c = 0; final String[] imgstubs = new String[imagesc.size()];
for (final ImageEntry ie: imagesc) images[c++] = ie.toString(); final String[] imgalts = new String[imagesc.size()];
addSolr(solrdoc, "imagescount_i", images.length); c = 0;
if (images.length > 0) addSolr(solrdoc, "attr_images", images); for (final ImageEntry ie: imagesc) {
final MultiProtocolURI uri = ie.url();
imgtags[c] = ie.toString();
imgprots[c] = uri.getProtocol();
imgstubs[c] = uri.toString().substring(imgprots[c].length() + 3);
imgalts[c] = ie.alt();
c++;
} }
addSolr(solrdoc, "imagescount_i", imgtags.length);
if (isEmpty() || contains("attr_images_tag")) addSolr(solrdoc, "attr_images_tag", imgtags);
if (isEmpty() || contains("attr_images_protocol")) addSolr(solrdoc, "attr_images_protocol", imgprots);
if (isEmpty() || contains("attr_images_urlstub")) addSolr(solrdoc, "attr_images_urlstub", imgstubs);
if (isEmpty() || contains("attr_images_alt")) addSolr(solrdoc, "attr_images_alt", imgalts);
// style sheets // style sheets
if (isEmpty() || contains("attr_css")) { if (isEmpty() || contains("attr_css")) {
final Map<MultiProtocolURI, String> csss = html.getCSS(); final Map<MultiProtocolURI, String> csss = html.getCSS();
final String[] css = new String[csss.size()]; final String[] css_tag = new String[csss.size()];
final String[] css_url = new String[csss.size()];
c = 0; c = 0;
for (final Map.Entry<MultiProtocolURI, String> entry: csss.entrySet()) { for (final Map.Entry<MultiProtocolURI, String> entry: csss.entrySet()) {
css[c++] = final String url = entry.getKey().toNormalform(false, false, false, false);
css_tag[c] =
"<link rel=\"stylesheet\" type=\"text/css\" media=\"" + entry.getValue() + "\"" + "<link rel=\"stylesheet\" type=\"text/css\" media=\"" + entry.getValue() + "\"" +
" href=\""+ entry.getKey().toNormalform(false, false, false, false) + "\" />"; " href=\""+ url + "\" />";
css_url[c] = url;
c++;
} }
addSolr(solrdoc, "csscount_i", css.length); addSolr(solrdoc, "csscount_i", css_tag.length);
if (css.length > 0) addSolr(solrdoc, "attr_css", css); if (css_tag.length > 0) addSolr(solrdoc, "attr_css_tag", css_tag);
if (css_url.length > 0) addSolr(solrdoc, "attr_css_url", css_url);
} }
// Scripts // Scripts

@ -447,6 +447,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (h.length() > 0) this.headlines[5].add(h); if (h.length() > 0) this.headlines[5].add(h);
} else if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) { } else if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) {
this.title = recursiveParse(text); this.title = recursiveParse(text);
this.evaluationScores.match(Element.title, this.title);
} else if ((tagname.equalsIgnoreCase("b")) && (text.length < 1024)) { } else if ((tagname.equalsIgnoreCase("b")) && (text.length < 1024)) {
h = recursiveParse(text); h = recursiveParse(text);
if (h.length() > 0) this.bold.inc(h); if (h.length() > 0) this.bold.inc(h);

@ -62,6 +62,7 @@ public class Evaluation {
public static enum Element { public static enum Element {
text, text,
title,
bodyclass, bodyclass,
divid, divid,
csspath, csspath,

@ -11,12 +11,12 @@
* modify it under the terms of the GNU Lesser General Public * modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either * License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version. * version 2.1 of the License, or (at your option) any later version.
* *
* This library is distributed in the hope that it will be useful, * This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of * but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details. * Lesser General Public License for more details.
* *
* You should have received a copy of the GNU Lesser General Public License * You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt * along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>. * If not, see <http://www.gnu.org/licenses/>.
@ -34,8 +34,6 @@ import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException; import java.nio.charset.UnsupportedCharsetException;
import java.util.regex.Pattern; import java.util.regex.Pattern;
import com.ibm.icu.text.CharsetDetector;
import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.document.AbstractParser; import net.yacy.document.AbstractParser;
@ -47,47 +45,49 @@ import net.yacy.document.parser.html.ScraperInputStream;
import net.yacy.document.parser.html.TransformerWriter; import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
import com.ibm.icu.text.CharsetDetector;
public class htmlParser extends AbstractParser implements Parser { public class htmlParser extends AbstractParser implements Parser {
private static final Pattern patternUnderline = Pattern.compile("_"); private static final Pattern patternUnderline = Pattern.compile("_");
public htmlParser() { public htmlParser() {
super("HTML Parser"); super("HTML Parser");
SUPPORTED_EXTENSIONS.add("htm"); this.SUPPORTED_EXTENSIONS.add("htm");
SUPPORTED_EXTENSIONS.add("html"); this.SUPPORTED_EXTENSIONS.add("html");
SUPPORTED_EXTENSIONS.add("phtml"); this.SUPPORTED_EXTENSIONS.add("phtml");
SUPPORTED_EXTENSIONS.add("shtml"); this.SUPPORTED_EXTENSIONS.add("shtml");
SUPPORTED_EXTENSIONS.add("xhtml"); this.SUPPORTED_EXTENSIONS.add("xhtml");
SUPPORTED_EXTENSIONS.add("php"); this.SUPPORTED_EXTENSIONS.add("php");
SUPPORTED_EXTENSIONS.add("php3"); this.SUPPORTED_EXTENSIONS.add("php3");
SUPPORTED_EXTENSIONS.add("php4"); this.SUPPORTED_EXTENSIONS.add("php4");
SUPPORTED_EXTENSIONS.add("php5"); this.SUPPORTED_EXTENSIONS.add("php5");
SUPPORTED_EXTENSIONS.add("cfm"); this.SUPPORTED_EXTENSIONS.add("cfm");
SUPPORTED_EXTENSIONS.add("asp"); this.SUPPORTED_EXTENSIONS.add("asp");
SUPPORTED_EXTENSIONS.add("aspx"); this.SUPPORTED_EXTENSIONS.add("aspx");
SUPPORTED_EXTENSIONS.add("tex"); this.SUPPORTED_EXTENSIONS.add("tex");
SUPPORTED_EXTENSIONS.add("txt"); this.SUPPORTED_EXTENSIONS.add("txt");
//SUPPORTED_EXTENSIONS.add("js"); //SUPPORTED_EXTENSIONS.add("js");
SUPPORTED_EXTENSIONS.add("jsp"); this.SUPPORTED_EXTENSIONS.add("jsp");
SUPPORTED_EXTENSIONS.add("mf"); this.SUPPORTED_EXTENSIONS.add("mf");
SUPPORTED_EXTENSIONS.add("pl"); this.SUPPORTED_EXTENSIONS.add("pl");
SUPPORTED_EXTENSIONS.add("py"); this.SUPPORTED_EXTENSIONS.add("py");
SUPPORTED_MIME_TYPES.add("text/html"); this.SUPPORTED_MIME_TYPES.add("text/html");
SUPPORTED_MIME_TYPES.add("text/xhtml+xml"); this.SUPPORTED_MIME_TYPES.add("text/xhtml+xml");
SUPPORTED_MIME_TYPES.add("application/xhtml+xml"); this.SUPPORTED_MIME_TYPES.add("application/xhtml+xml");
SUPPORTED_MIME_TYPES.add("application/x-httpd-php"); this.SUPPORTED_MIME_TYPES.add("application/x-httpd-php");
SUPPORTED_MIME_TYPES.add("application/x-tex"); this.SUPPORTED_MIME_TYPES.add("application/x-tex");
SUPPORTED_MIME_TYPES.add("text/plain"); this.SUPPORTED_MIME_TYPES.add("text/plain");
SUPPORTED_MIME_TYPES.add("text/sgml"); this.SUPPORTED_MIME_TYPES.add("text/sgml");
SUPPORTED_MIME_TYPES.add("text/csv"); this.SUPPORTED_MIME_TYPES.add("text/csv");
} }
public static ContentScraper parseToScraper( public static ContentScraper parseToScraper(
final MultiProtocolURI location, final MultiProtocolURI location,
final String documentCharset, final String documentCharset,
InputStream sourceStream) throws Parser.Failure, IOException { InputStream sourceStream) throws Parser.Failure, IOException {
// make a scraper // make a scraper
String charset = null; String charset = null;
@ -95,72 +95,72 @@ public class htmlParser extends AbstractParser implements Parser {
if (documentCharset != null) { if (documentCharset != null) {
charset = patchCharsetEncoding(documentCharset); charset = patchCharsetEncoding(documentCharset);
} }
// nothing found: try to find a meta-tag // nothing found: try to find a meta-tag
if (charset == null) { if (charset == null) {
try { try {
final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream,documentCharset,location,null,false); final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream,documentCharset,location,null,false);
sourceStream = htmlFilter; sourceStream = htmlFilter;
charset = htmlFilter.detectCharset(); charset = htmlFilter.detectCharset();
} catch (IOException e1) { } catch (final IOException e1) {
throw new Parser.Failure("Charset error:" + e1.getMessage(), location); throw new Parser.Failure("Charset error:" + e1.getMessage(), location);
} }
} }
// the author didn't tell us the encoding, try the mozilla-heuristic // the author didn't tell us the encoding, try the mozilla-heuristic
if (charset == null) { if (charset == null) {
CharsetDetector det = new CharsetDetector(); final CharsetDetector det = new CharsetDetector();
det.enableInputFilter(true); det.enableInputFilter(true);
InputStream detStream = new BufferedInputStream(sourceStream); final InputStream detStream = new BufferedInputStream(sourceStream);
det.setText(detStream); det.setText(detStream);
charset = det.detect().getName(); charset = det.detect().getName();
sourceStream = detStream; sourceStream = detStream;
} }
// wtf? still nothing, just take system-standard // wtf? still nothing, just take system-standard
if (charset == null) { if (charset == null) {
charset = Charset.defaultCharset().name(); charset = Charset.defaultCharset().name();
} }
Charset c; Charset c;
try { try {
c = Charset.forName(charset); c = Charset.forName(charset);
} catch (IllegalCharsetNameException e) { } catch (final IllegalCharsetNameException e) {
c = Charset.defaultCharset(); c = Charset.defaultCharset();
} catch (UnsupportedCharsetException e) { } catch (final UnsupportedCharsetException e) {
c = Charset.defaultCharset(); c = Charset.defaultCharset();
} }
// parsing the content // parsing the content
final ContentScraper scraper = new ContentScraper(location); final ContentScraper scraper = new ContentScraper(location);
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false); final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false);
try { try {
FileUtils.copy(sourceStream, writer, c); FileUtils.copy(sourceStream, writer, c);
} catch (IOException e) { } catch (final IOException e) {
throw new Parser.Failure("IO error:" + e.getMessage(), location); throw new Parser.Failure("IO error:" + e.getMessage(), location);
} finally { } finally {
sourceStream.close(); sourceStream.close();
writer.close(); writer.close();
} }
//OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false); //OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
//serverFileUtils.copy(sourceFile, hfos); //serverFileUtils.copy(sourceFile, hfos);
//hfos.close(); //hfos.close();
if (writer.binarySuspect()) { if (writer.binarySuspect()) {
final String errorMsg = "Binary data found in resource"; final String errorMsg = "Binary data found in resource";
throw new Parser.Failure(errorMsg, location); throw new Parser.Failure(errorMsg, location);
} }
return scraper; return scraper;
} }
public Document[] parse( public Document[] parse(
final MultiProtocolURI location, final MultiProtocolURI location,
final String mimeType, final String mimeType,
final String documentCharset, final String documentCharset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException { final InputStream sourceStream) throws Parser.Failure, InterruptedException {
try { try {
return transformScraper(location, mimeType, documentCharset, parseToScraper(location, documentCharset, sourceStream)); return transformScraper(location, mimeType, documentCharset, parseToScraper(location, documentCharset, sourceStream));
} catch (IOException e) { } catch (final IOException e) {
throw new Parser.Failure("IOException in htmlParser: " + e.getMessage(), location); throw new Parser.Failure("IOException in htmlParser: " + e.getMessage(), location);
} }
} }
@ -197,7 +197,7 @@ public class htmlParser extends AbstractParser implements Parser {
scraper.getRSS(), scraper.getRSS(),
scraper.getImages(), scraper.getImages(),
scraper.indexingDenied())}; scraper.indexingDenied())};
//scraper.close(); //scraper.close();
for (final Document ppd: ppds) { for (final Document ppd: ppds) {
ppd.setFavicon(scraper.getFavicon()); ppd.setFavicon(scraper.getFavicon());
} }
@ -214,10 +214,10 @@ public class htmlParser extends AbstractParser implements Parser {
* @return patched encoding name * @return patched encoding name
*/ */
public static String patchCharsetEncoding(String encoding) { public static String patchCharsetEncoding(String encoding) {
// do nothing with null // do nothing with null
if ((encoding == null) || (encoding.length() < 3)) return null; if ((encoding == null) || (encoding.length() < 3)) return null;
// trim encoding string // trim encoding string
encoding = encoding.trim(); encoding = encoding.trim();
@ -228,7 +228,7 @@ public class htmlParser extends AbstractParser implements Parser {
// all other names but such with "windows" use uppercase // all other names but such with "windows" use uppercase
if (encoding.startsWith("WINDOWS")) encoding = "windows" + encoding.substring(7); if (encoding.startsWith("WINDOWS")) encoding = "windows" + encoding.substring(7);
if (encoding.startsWith("MACINTOSH")) encoding = "MacRoman"; if (encoding.startsWith("MACINTOSH")) encoding = "MacRoman";
// fix wrong fill characters // fix wrong fill characters
encoding = patternUnderline.matcher(encoding).replaceAll("-"); encoding = patternUnderline.matcher(encoding).replaceAll("-");
@ -236,7 +236,7 @@ public class htmlParser extends AbstractParser implements Parser {
if (encoding.matches(".*UTF[-_]?8.*")) return "UTF-8"; if (encoding.matches(".*UTF[-_]?8.*")) return "UTF-8";
if (encoding.startsWith("US")) return "US-ASCII"; if (encoding.startsWith("US")) return "US-ASCII";
if (encoding.startsWith("KOI")) return "KOI8-R"; if (encoding.startsWith("KOI")) return "KOI8-R";
// patch missing '-' // patch missing '-'
if (encoding.startsWith("windows") && encoding.length() > 7) { if (encoding.startsWith("windows") && encoding.length() > 7) {
final char c = encoding.charAt(7); final char c = encoding.charAt(7);
@ -244,7 +244,7 @@ public class htmlParser extends AbstractParser implements Parser {
encoding = "windows-" + encoding.substring(7); encoding = "windows-" + encoding.substring(7);
} }
} }
if (encoding.startsWith("ISO")) { if (encoding.startsWith("ISO")) {
// patch typos // patch typos
if (encoding.length() > 3) { if (encoding.length() > 3) {
@ -256,11 +256,11 @@ public class htmlParser extends AbstractParser implements Parser {
if (encoding.length() > 8) { if (encoding.length() > 8) {
final char c = encoding.charAt(8); final char c = encoding.charAt(8);
if ((c >= '0') && (c <= '9')) { if ((c >= '0') && (c <= '9')) {
encoding = encoding.substring(0, 8) + "-" + encoding.substring(8); encoding = encoding.substring(0, 8) + "-" + encoding.substring(8);
} }
} }
} }
// patch wrong name // patch wrong name
if (encoding.startsWith("ISO-8559")) { if (encoding.startsWith("ISO-8559")) {
// popular typo // popular typo
@ -279,26 +279,26 @@ public class htmlParser extends AbstractParser implements Parser {
return encoding; return encoding;
} }
public static void main(String[] args) { public static void main(final String[] args) {
// test parsing of a url // test parsing of a url
MultiProtocolURI url; MultiProtocolURI url;
try { try {
url = new MultiProtocolURI(args[0]); url = new MultiProtocolURI(args[0]);
byte[] content = url.get(ClientIdentification.getUserAgent(), 3000); final byte[] content = url.get(ClientIdentification.getUserAgent(), 3000);
Document[] document = new htmlParser().parse(url, "text/html", null, new ByteArrayInputStream(content)); final Document[] document = new htmlParser().parse(url, "text/html", null, new ByteArrayInputStream(content));
String title = document[0].dc_title(); final String title = document[0].dc_title();
System.out.println(title); System.out.println(title);
System.out.println(CharacterCoding.unicode2html(title, false)); System.out.println(CharacterCoding.unicode2html(title, false));
} catch (MalformedURLException e) { } catch (final MalformedURLException e) {
e.printStackTrace(); e.printStackTrace();
} catch (IOException e) { } catch (final IOException e) {
e.printStackTrace(); e.printStackTrace();
} catch (Parser.Failure e) { } catch (final Parser.Failure e) {
e.printStackTrace(); e.printStackTrace();
} catch (InterruptedException e) { } catch (final InterruptedException e) {
e.printStackTrace(); e.printStackTrace();
} }
} }
} }

@ -147,7 +147,7 @@ public final class LoaderDispatcher {
FileUtils.copy(b, tmp); FileUtils.copy(b, tmp);
tmp.renameTo(targetFile); tmp.renameTo(targetFile);
} }
public Response load(final Request request, final CacheStrategy cacheStrategy, final boolean checkBlacklist) throws IOException { public Response load(final Request request, final CacheStrategy cacheStrategy, final boolean checkBlacklist) throws IOException {
return load(request, cacheStrategy, protocolMaxFileSize(request.url()), checkBlacklist); return load(request, cacheStrategy, protocolMaxFileSize(request.url()), checkBlacklist);
} }
@ -274,7 +274,7 @@ public final class LoaderDispatcher {
if (response != null && response.getContent() != null) { if (response != null && response.getContent() != null) {
// we got something. Now check if we want to store that to the cache // we got something. Now check if we want to store that to the cache
// first check looks if we want to store the content to the cache // first check looks if we want to store the content to the cache
if (!crawlProfile.storeHTCache()) { if (crawlProfile == null || !crawlProfile.storeHTCache()) {
// no caching wanted. Thats ok, do not write any message // no caching wanted. Thats ok, do not write any message
return response; return response;
} }
@ -294,7 +294,7 @@ public final class LoaderDispatcher {
throw new IOException("Unsupported protocol '" + protocol + "' in url " + url); throw new IOException("Unsupported protocol '" + protocol + "' in url " + url);
} }
private int protocolMaxFileSize(final DigestURI url) { private int protocolMaxFileSize(final DigestURI url) {
if (url.isHTTP() || url.isHTTPS()) if (url.isHTTP() || url.isHTTPS())
return this.sb.getConfigInt("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE); return this.sb.getConfigInt("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);

@ -124,6 +124,8 @@ then
then then
ENABLEHUGEPAGES=1 ENABLEHUGEPAGES=1
fi fi
# the G1 GC is on by default in Java7, so we try that here as well
# JAVA_ARGS="$JAVA_ARGS -XX:+UnlockExperimentalVMOptions -XX:+UseG1GC"
elif [ $OS = "SunOS" ] elif [ $OS = "SunOS" ]
then then
# the UseConcMarkSweepGC option caused a full CPU usage - bug on Darwin. # the UseConcMarkSweepGC option caused a full CPU usage - bug on Darwin.

Loading…
Cancel
Save