bugfixes in html parser

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7912 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent b00e69c5df
commit 1c007188ad

@ -45,8 +45,11 @@ keywords
## character encoding, string
charset_s
## tags of css entries, normalized with absolute URL, textgen
attr_css_tag
## urls of css entries, normalized with absolute URL, textgen
attr_css
attr_css_url
## number of css entries, int
csscount_i
@ -74,11 +77,24 @@ wordcount_i
## internal links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
attr_inboundlinks_tag
attr_inboundlinks_protocol
attr_inboundlinks_urlstub
attr_inboundlinks_name
attr_inboundlinks_rel
attr_inboundlinks_text
## internal links, only the protocol
#attr_inboundlinks_protocol
## internal links, the url only without the protocol
#attr_inboundlinks_urlstub
## internal links, the name property of the a-tag
#attr_inboundlinks_name
## internal links, the rel property of the a-tag
#attr_inboundlinks_rel
## internal links, the rel property of the a-tag, coded binary
#attr_inboundlinks_relcode
## internal links, the text content of the a-tag
#attr_inboundlinks_text
## total number of inbound links, int
inboundlinkscount_i
@ -88,18 +104,43 @@ inboundlinksnoindexcount_i
## external links, normalized (absolute URLs), as <a> - tag with anchor text and nofollow, textgen
attr_outboundlinks_tag
attr_outboundlinks_protocol
attr_outboundlinks_urlstub
attr_outboundlinks_name
attr_outboundlinks_rel
attr_outboundlinks_text
## total number of external links, int
outboundlinkscount_i
## external links, only the protocol
#attr_outboundlinks_protocol
## external links, the url only without the protocol
#attr_outboundlinks_urlstub
## external links, the name property of the a-tag
#attr_outboundlinks_name
## external links, the rel property of the a-tag
#attr_outboundlinks_rel
## external links, the text content of the a-tag
#attr_outboundlinks_text
## total number of external (outbound) links, int
outboundlinks_i
## number of external links with noindex tag, int
outboundlinksnoindexcount_i
## all image tags, encoded as <img> tag inclusive alt- and title property, textgen
attr_images_tag
## all image links without the protocol and '://'
#attr_images_urlstub
## all image link protocols
#attr_images_protocol
## alt attributes of all image tags
#attr_images_alt
## number of images, int
imagescount_i
## h1 header, textgen
attr_h1
@ -154,12 +195,6 @@ attr_italiccount
## total number of occurrences of <i>, int
italic_i
## all image tags, encoded as <img> tag inclusive alt- and title property, textgen
attr_images
## number of images, int
imagescount_i
## flag that shows if a swf file is linked, boolean
flash_b
@ -205,6 +240,12 @@ attr_tracker
## number of attribute counts in attr_tracker, textgen
attr_trackercount
## names matching title expressions, textgen
attr_title
## number of matching title expressions, textgen
attr_titlecount
## fail reason if a page was not loaded. if the page was loaded then this field is empty, text
failreason_t

@ -128,76 +128,75 @@ public class SolrScheme extends ConfigurationSet {
int c = 0;
if (isEmpty() || contains("inboundlinkscount_i")) addSolr(solrdoc, "inboundlinkscount_i", yacydoc.inboundLinkCount());
if (isEmpty() || contains("inboundlinksnoindexcount_i")) addSolr(solrdoc, "inboundlinksnoindexcount_i", yacydoc.inboundLinkNoindexCount());
if (isEmpty() || contains("attr_inboundlinks")) {
final String[] inboundlinksTag = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksURLProtocol = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksURLStub = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksName = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksRel = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksText = new String[yacydoc.inboundLinkCount()];
for (final MultiProtocolURI url: yacydoc.inboundLinks()) {
final Properties p = alllinks.get(url);
final String name = p.getProperty("name", ""); // the name attribute
final String rel = p.getProperty("rel", ""); // the rel-attribute
final String text = p.getProperty("text", ""); // the text between the <a></a> tag
final String urls = url.toNormalform(false, false);
final int pr = urls.indexOf("://");
inboundlinksURLProtocol[c] = urls.substring(0, pr);
inboundlinksURLStub[c] = urls.substring(pr + 3);
inboundlinksName[c] = name.length() > 0 ? name : "";
inboundlinksRel[c] = rel.length() > 0 ? rel : "";
inboundlinksText[c] = text.length() > 0 ? rel : "";
inboundlinksTag[c] =
"<a href=\"" + url.toNormalform(false, false) + "\"" +
(rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
">" +
((name.length() > 0) ? name : "") + "</a>";
c++;
}
addSolr(solrdoc, "attr_inboundlinks_tag", inboundlinksTag);
addSolr(solrdoc, "attr_inboundlinks_protocol", inboundlinksURLProtocol);
addSolr(solrdoc, "attr_inboundlinks_urlstub", inboundlinksURLStub);
addSolr(solrdoc, "attr_inboundlinks_name", inboundlinksName);
addSolr(solrdoc, "attr_inboundlinks_rel", inboundlinksRel);
addSolr(solrdoc, "attr_inboundlinks_text", inboundlinksText);
final String[] inboundlinksTag = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksURLProtocol = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksURLStub = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksName = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksRel = new String[yacydoc.inboundLinkCount()];
final String[] inboundlinksText = new String[yacydoc.inboundLinkCount()];
for (final MultiProtocolURI url: yacydoc.inboundLinks()) {
final Properties p = alllinks.get(url);
final String name = p.getProperty("name", ""); // the name attribute
final String rel = p.getProperty("rel", ""); // the rel-attribute
final String text = p.getProperty("text", ""); // the text between the <a></a> tag
final String urls = url.toNormalform(false, false);
final int pr = urls.indexOf("://");
inboundlinksURLProtocol[c] = urls.substring(0, pr);
inboundlinksURLStub[c] = urls.substring(pr + 3);
inboundlinksName[c] = name.length() > 0 ? name : "";
inboundlinksRel[c] = rel.length() > 0 ? rel : "";
inboundlinksText[c] = text.length() > 0 ? text : "";
inboundlinksTag[c] =
"<a href=\"" + url.toNormalform(false, false) + "\"" +
(rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
(name.length() > 0 ? " name=\"" + name + "\"" : "") +
">" +
((text.length() > 0) ? text : "") + "</a>";
c++;
}
if (isEmpty() || contains("attr_inboundlinks_tag")) addSolr(solrdoc, "attr_inboundlinks_tag", inboundlinksTag);
if (isEmpty() || contains("attr_inboundlinks_protocol")) addSolr(solrdoc, "attr_inboundlinks_protocol", inboundlinksURLProtocol);
if (isEmpty() || contains("attr_inboundlinks_urlstub")) addSolr(solrdoc, "attr_inboundlinks_urlstub", inboundlinksURLStub);
if (isEmpty() || contains("attr_inboundlinks_name")) addSolr(solrdoc, "attr_inboundlinks_name", inboundlinksName);
if (isEmpty() || contains("attr_inboundlinks_rel")) addSolr(solrdoc, "attr_inboundlinks_rel", inboundlinksRel);
if (isEmpty() || contains("attr_inboundlinks_text")) addSolr(solrdoc, "attr_inboundlinks_text", inboundlinksText);
c = 0;
if (isEmpty() || contains("outboundlinkscount_i")) addSolr(solrdoc, "outboundlinkscount_i", yacydoc.outboundLinkCount());
if (isEmpty() || contains("outboundlinksnoindexcount_i")) addSolr(solrdoc, "outboundlinksnoindexcount_i", yacydoc.outboundLinkNoindexCount());
if (isEmpty() || contains("attr_outboundlinks")) {
final String[] outboundlinksTag = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksURLProtocol = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksURLStub = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksName = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksRel = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksText = new String[yacydoc.outboundLinkCount()];
for (final MultiProtocolURI url: yacydoc.outboundLinks()) {
final Properties p = alllinks.get(url);
final String name = p.getProperty("name", ""); // the name attribute
final String rel = p.getProperty("rel", ""); // the rel-attribute
final String text = p.getProperty("text", ""); // the text between the <a></a> tag
final String urls = url.toNormalform(false, false);
final int pr = urls.indexOf("://");
outboundlinksURLProtocol[c] = urls.substring(0, pr);
outboundlinksURLStub[c] = urls.substring(pr + 3);
outboundlinksName[c] = name.length() > 0 ? name : "";
outboundlinksRel[c] = rel.length() > 0 ? rel : "";
outboundlinksText[c] = text.length() > 0 ? rel : "";
outboundlinksTag[c] =
"<a href=\"" + url.toNormalform(false, false) + "\"" +
(rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
">" +
((name.length() > 0) ? name : "") + "</a>";
c++;
}
addSolr(solrdoc, "attr_outboundlinks_tag", outboundlinksTag);
addSolr(solrdoc, "attr_outboundlinks_protocol", outboundlinksURLProtocol);
addSolr(solrdoc, "attr_outboundlinks_urlstub", outboundlinksURLStub);
addSolr(solrdoc, "attr_outboundlinks_name", outboundlinksName);
addSolr(solrdoc, "attr_outboundlinks_rel", outboundlinksRel);
addSolr(solrdoc, "attr_outboundlinks_text", outboundlinksText);
final String[] outboundlinksTag = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksURLProtocol = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksURLStub = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksName = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksRel = new String[yacydoc.outboundLinkCount()];
final String[] outboundlinksText = new String[yacydoc.outboundLinkCount()];
for (final MultiProtocolURI url: yacydoc.outboundLinks()) {
final Properties p = alllinks.get(url);
final String name = p.getProperty("name", ""); // the name attribute
final String rel = p.getProperty("rel", ""); // the rel-attribute
final String text = p.getProperty("text", ""); // the text between the <a></a> tag
final String urls = url.toNormalform(false, false);
final int pr = urls.indexOf("://");
outboundlinksURLProtocol[c] = urls.substring(0, pr);
outboundlinksURLStub[c] = urls.substring(pr + 3);
outboundlinksName[c] = name.length() > 0 ? name : "";
outboundlinksRel[c] = rel.length() > 0 ? rel : "";
outboundlinksText[c] = text.length() > 0 ? text : "";
outboundlinksTag[c] =
"<a href=\"" + url.toNormalform(false, false) + "\"" +
(rel.length() > 0 ? " rel=\"" + rel + "\"" : "") +
(name.length() > 0 ? " name=\"" + name + "\"" : "") +
">" +
((text.length() > 0) ? text : "") + "</a>";
c++;
}
if (isEmpty() || contains("attr_outboundlinks_tag")) addSolr(solrdoc, "attr_outboundlinks_tag", outboundlinksTag);
if (isEmpty() || contains("attr_outboundlinks_protocol")) addSolr(solrdoc, "attr_outboundlinks_protocol", outboundlinksURLProtocol);
if (isEmpty() || contains("attr_outboundlinks_urlstub")) addSolr(solrdoc, "attr_outboundlinks_urlstub", outboundlinksURLStub);
if (isEmpty() || contains("attr_outboundlinks_name")) addSolr(solrdoc, "attr_outboundlinks_name", outboundlinksName);
if (isEmpty() || contains("attr_outboundlinks_rel")) addSolr(solrdoc, "attr_outboundlinks_rel", outboundlinksRel);
if (isEmpty() || contains("attr_outboundlinks_text")) addSolr(solrdoc, "attr_outboundlinks_text", outboundlinksText);
// charset
addSolr(solrdoc, "charset_s", yacydoc.getCharset());
@ -255,27 +254,43 @@ public class SolrScheme extends ConfigurationSet {
if (li.length > 0) addSolr(solrdoc, "attr_li", li);
// images
if (isEmpty() || contains("attr_images")) {
final Collection<ImageEntry> imagesc = html.getImages().values();
final String[] images = new String[imagesc.size()];
c = 0;
for (final ImageEntry ie: imagesc) images[c++] = ie.toString();
addSolr(solrdoc, "imagescount_i", images.length);
if (images.length > 0) addSolr(solrdoc, "attr_images", images);
final Collection<ImageEntry> imagesc = html.getImages().values();
final String[] imgtags = new String[imagesc.size()];
final String[] imgprots = new String[imagesc.size()];
final String[] imgstubs = new String[imagesc.size()];
final String[] imgalts = new String[imagesc.size()];
c = 0;
for (final ImageEntry ie: imagesc) {
final MultiProtocolURI uri = ie.url();
imgtags[c] = ie.toString();
imgprots[c] = uri.getProtocol();
imgstubs[c] = uri.toString().substring(imgprots[c].length() + 3);
imgalts[c] = ie.alt();
c++;
}
addSolr(solrdoc, "imagescount_i", imgtags.length);
if (isEmpty() || contains("attr_images_tag")) addSolr(solrdoc, "attr_images_tag", imgtags);
if (isEmpty() || contains("attr_images_protocol")) addSolr(solrdoc, "attr_images_protocol", imgprots);
if (isEmpty() || contains("attr_images_urlstub")) addSolr(solrdoc, "attr_images_urlstub", imgstubs);
if (isEmpty() || contains("attr_images_alt")) addSolr(solrdoc, "attr_images_alt", imgalts);
// style sheets
if (isEmpty() || contains("attr_css")) {
final Map<MultiProtocolURI, String> csss = html.getCSS();
final String[] css = new String[csss.size()];
final String[] css_tag = new String[csss.size()];
final String[] css_url = new String[csss.size()];
c = 0;
for (final Map.Entry<MultiProtocolURI, String> entry: csss.entrySet()) {
css[c++] =
final String url = entry.getKey().toNormalform(false, false, false, false);
css_tag[c] =
"<link rel=\"stylesheet\" type=\"text/css\" media=\"" + entry.getValue() + "\"" +
" href=\""+ entry.getKey().toNormalform(false, false, false, false) + "\" />";
" href=\""+ url + "\" />";
css_url[c] = url;
c++;
}
addSolr(solrdoc, "csscount_i", css.length);
if (css.length > 0) addSolr(solrdoc, "attr_css", css);
addSolr(solrdoc, "csscount_i", css_tag.length);
if (css_tag.length > 0) addSolr(solrdoc, "attr_css_tag", css_tag);
if (css_url.length > 0) addSolr(solrdoc, "attr_css_url", css_url);
}
// Scripts

@ -447,6 +447,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (h.length() > 0) this.headlines[5].add(h);
} else if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) {
this.title = recursiveParse(text);
this.evaluationScores.match(Element.title, this.title);
} else if ((tagname.equalsIgnoreCase("b")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) this.bold.inc(h);

@ -62,6 +62,7 @@ public class Evaluation {
public static enum Element {
text,
title,
bodyclass,
divid,
csspath,

@ -11,12 +11,12 @@
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
@ -34,8 +34,6 @@ import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.UnsupportedCharsetException;
import java.util.regex.Pattern;
import com.ibm.icu.text.CharsetDetector;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.document.AbstractParser;
@ -47,47 +45,49 @@ import net.yacy.document.parser.html.ScraperInputStream;
import net.yacy.document.parser.html.TransformerWriter;
import net.yacy.kelondro.util.FileUtils;
import com.ibm.icu.text.CharsetDetector;
public class htmlParser extends AbstractParser implements Parser {
private static final Pattern patternUnderline = Pattern.compile("_");
public htmlParser() {
super("HTML Parser");
SUPPORTED_EXTENSIONS.add("htm");
SUPPORTED_EXTENSIONS.add("html");
SUPPORTED_EXTENSIONS.add("phtml");
SUPPORTED_EXTENSIONS.add("shtml");
SUPPORTED_EXTENSIONS.add("xhtml");
SUPPORTED_EXTENSIONS.add("php");
SUPPORTED_EXTENSIONS.add("php3");
SUPPORTED_EXTENSIONS.add("php4");
SUPPORTED_EXTENSIONS.add("php5");
SUPPORTED_EXTENSIONS.add("cfm");
SUPPORTED_EXTENSIONS.add("asp");
SUPPORTED_EXTENSIONS.add("aspx");
SUPPORTED_EXTENSIONS.add("tex");
SUPPORTED_EXTENSIONS.add("txt");
super("HTML Parser");
this.SUPPORTED_EXTENSIONS.add("htm");
this.SUPPORTED_EXTENSIONS.add("html");
this.SUPPORTED_EXTENSIONS.add("phtml");
this.SUPPORTED_EXTENSIONS.add("shtml");
this.SUPPORTED_EXTENSIONS.add("xhtml");
this.SUPPORTED_EXTENSIONS.add("php");
this.SUPPORTED_EXTENSIONS.add("php3");
this.SUPPORTED_EXTENSIONS.add("php4");
this.SUPPORTED_EXTENSIONS.add("php5");
this.SUPPORTED_EXTENSIONS.add("cfm");
this.SUPPORTED_EXTENSIONS.add("asp");
this.SUPPORTED_EXTENSIONS.add("aspx");
this.SUPPORTED_EXTENSIONS.add("tex");
this.SUPPORTED_EXTENSIONS.add("txt");
//SUPPORTED_EXTENSIONS.add("js");
SUPPORTED_EXTENSIONS.add("jsp");
SUPPORTED_EXTENSIONS.add("mf");
SUPPORTED_EXTENSIONS.add("pl");
SUPPORTED_EXTENSIONS.add("py");
SUPPORTED_MIME_TYPES.add("text/html");
SUPPORTED_MIME_TYPES.add("text/xhtml+xml");
SUPPORTED_MIME_TYPES.add("application/xhtml+xml");
SUPPORTED_MIME_TYPES.add("application/x-httpd-php");
SUPPORTED_MIME_TYPES.add("application/x-tex");
SUPPORTED_MIME_TYPES.add("text/plain");
SUPPORTED_MIME_TYPES.add("text/sgml");
SUPPORTED_MIME_TYPES.add("text/csv");
this.SUPPORTED_EXTENSIONS.add("jsp");
this.SUPPORTED_EXTENSIONS.add("mf");
this.SUPPORTED_EXTENSIONS.add("pl");
this.SUPPORTED_EXTENSIONS.add("py");
this.SUPPORTED_MIME_TYPES.add("text/html");
this.SUPPORTED_MIME_TYPES.add("text/xhtml+xml");
this.SUPPORTED_MIME_TYPES.add("application/xhtml+xml");
this.SUPPORTED_MIME_TYPES.add("application/x-httpd-php");
this.SUPPORTED_MIME_TYPES.add("application/x-tex");
this.SUPPORTED_MIME_TYPES.add("text/plain");
this.SUPPORTED_MIME_TYPES.add("text/sgml");
this.SUPPORTED_MIME_TYPES.add("text/csv");
}
public static ContentScraper parseToScraper(
final MultiProtocolURI location,
final String documentCharset,
final MultiProtocolURI location,
final String documentCharset,
InputStream sourceStream) throws Parser.Failure, IOException {
// make a scraper
String charset = null;
@ -95,72 +95,72 @@ public class htmlParser extends AbstractParser implements Parser {
if (documentCharset != null) {
charset = patchCharsetEncoding(documentCharset);
}
// nothing found: try to find a meta-tag
if (charset == null) {
try {
final ScraperInputStream htmlFilter = new ScraperInputStream(sourceStream,documentCharset,location,null,false);
sourceStream = htmlFilter;
charset = htmlFilter.detectCharset();
} catch (IOException e1) {
} catch (final IOException e1) {
throw new Parser.Failure("Charset error:" + e1.getMessage(), location);
}
}
// the author didn't tell us the encoding, try the ICU charset detection heuristic
if (charset == null) {
CharsetDetector det = new CharsetDetector();
final CharsetDetector det = new CharsetDetector();
det.enableInputFilter(true);
InputStream detStream = new BufferedInputStream(sourceStream);
final InputStream detStream = new BufferedInputStream(sourceStream);
det.setText(detStream);
charset = det.detect().getName();
sourceStream = detStream;
}
// still nothing found: fall back to the system default charset
if (charset == null) {
charset = Charset.defaultCharset().name();
}
Charset c;
try {
c = Charset.forName(charset);
} catch (IllegalCharsetNameException e) {
} catch (final IllegalCharsetNameException e) {
c = Charset.defaultCharset();
} catch (UnsupportedCharsetException e) {
} catch (final UnsupportedCharsetException e) {
c = Charset.defaultCharset();
}
// parsing the content
final ContentScraper scraper = new ContentScraper(location);
final ContentScraper scraper = new ContentScraper(location);
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false);
try {
FileUtils.copy(sourceStream, writer, c);
} catch (IOException e) {
} catch (final IOException e) {
throw new Parser.Failure("IO error:" + e.getMessage(), location);
} finally {
sourceStream.close();
writer.close();
}
//OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
//OutputStream hfos = new htmlFilterOutputStream(null, scraper, null, false);
//serverFileUtils.copy(sourceFile, hfos);
//hfos.close();
if (writer.binarySuspect()) {
final String errorMsg = "Binary data found in resource";
throw new Parser.Failure(errorMsg, location);
throw new Parser.Failure(errorMsg, location);
}
return scraper;
}
public Document[] parse(
final MultiProtocolURI location,
final String mimeType,
final String documentCharset,
final MultiProtocolURI location,
final String mimeType,
final String documentCharset,
final InputStream sourceStream) throws Parser.Failure, InterruptedException {
try {
return transformScraper(location, mimeType, documentCharset, parseToScraper(location, documentCharset, sourceStream));
} catch (IOException e) {
} catch (final IOException e) {
throw new Parser.Failure("IOException in htmlParser: " + e.getMessage(), location);
}
}
@ -197,7 +197,7 @@ public class htmlParser extends AbstractParser implements Parser {
scraper.getRSS(),
scraper.getImages(),
scraper.indexingDenied())};
//scraper.close();
//scraper.close();
for (final Document ppd: ppds) {
ppd.setFavicon(scraper.getFavicon());
}
@ -214,10 +214,10 @@ public class htmlParser extends AbstractParser implements Parser {
* @return patched encoding name
*/
public static String patchCharsetEncoding(String encoding) {
// do nothing with null
if ((encoding == null) || (encoding.length() < 3)) return null;
// trim encoding string
encoding = encoding.trim();
@ -228,7 +228,7 @@ public class htmlParser extends AbstractParser implements Parser {
// all names except those starting with "windows" use uppercase
if (encoding.startsWith("WINDOWS")) encoding = "windows" + encoding.substring(7);
if (encoding.startsWith("MACINTOSH")) encoding = "MacRoman";
// fix wrong fill characters
encoding = patternUnderline.matcher(encoding).replaceAll("-");
@ -236,7 +236,7 @@ public class htmlParser extends AbstractParser implements Parser {
if (encoding.matches(".*UTF[-_]?8.*")) return "UTF-8";
if (encoding.startsWith("US")) return "US-ASCII";
if (encoding.startsWith("KOI")) return "KOI8-R";
// patch missing '-'
if (encoding.startsWith("windows") && encoding.length() > 7) {
final char c = encoding.charAt(7);
@ -244,7 +244,7 @@ public class htmlParser extends AbstractParser implements Parser {
encoding = "windows-" + encoding.substring(7);
}
}
if (encoding.startsWith("ISO")) {
// patch typos
if (encoding.length() > 3) {
@ -256,11 +256,11 @@ public class htmlParser extends AbstractParser implements Parser {
if (encoding.length() > 8) {
final char c = encoding.charAt(8);
if ((c >= '0') && (c <= '9')) {
encoding = encoding.substring(0, 8) + "-" + encoding.substring(8);
}
encoding = encoding.substring(0, 8) + "-" + encoding.substring(8);
}
}
}
// patch wrong name
if (encoding.startsWith("ISO-8559")) {
// popular typo
@ -279,26 +279,26 @@ public class htmlParser extends AbstractParser implements Parser {
return encoding;
}
public static void main(String[] args) {
public static void main(final String[] args) {
// test parsing of a url
MultiProtocolURI url;
try {
url = new MultiProtocolURI(args[0]);
byte[] content = url.get(ClientIdentification.getUserAgent(), 3000);
Document[] document = new htmlParser().parse(url, "text/html", null, new ByteArrayInputStream(content));
String title = document[0].dc_title();
final byte[] content = url.get(ClientIdentification.getUserAgent(), 3000);
final Document[] document = new htmlParser().parse(url, "text/html", null, new ByteArrayInputStream(content));
final String title = document[0].dc_title();
System.out.println(title);
System.out.println(CharacterCoding.unicode2html(title, false));
} catch (MalformedURLException e) {
} catch (final MalformedURLException e) {
e.printStackTrace();
} catch (IOException e) {
} catch (final IOException e) {
e.printStackTrace();
} catch (Parser.Failure e) {
} catch (final Parser.Failure e) {
e.printStackTrace();
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
e.printStackTrace();
}
}
}

@ -147,7 +147,7 @@ public final class LoaderDispatcher {
FileUtils.copy(b, tmp);
tmp.renameTo(targetFile);
}
public Response load(final Request request, final CacheStrategy cacheStrategy, final boolean checkBlacklist) throws IOException {
return load(request, cacheStrategy, protocolMaxFileSize(request.url()), checkBlacklist);
}
@ -274,7 +274,7 @@ public final class LoaderDispatcher {
if (response != null && response.getContent() != null) {
// we got something. Now check if we want to store that to the cache
// first check looks if we want to store the content to the cache
if (!crawlProfile.storeHTCache()) {
if (crawlProfile == null || !crawlProfile.storeHTCache()) {
// no caching wanted. That's ok, do not write any message
return response;
}
@ -294,7 +294,7 @@ public final class LoaderDispatcher {
throw new IOException("Unsupported protocol '" + protocol + "' in url " + url);
}
private int protocolMaxFileSize(final DigestURI url) {
if (url.isHTTP() || url.isHTTPS())
return this.sb.getConfigInt("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);

@ -124,6 +124,8 @@ then
then
ENABLEHUGEPAGES=1
fi
# the G1 GC is available in Java 7 (it only became the default collector in Java 9), so we may try it here as well
# JAVA_ARGS="$JAVA_ARGS -XX:+UnlockExperimentalVMOptions -XX:+UseG1GC"
elif [ $OS = "SunOS" ]
then
# the UseConcMarkSweepGC option caused a full CPU usage - bug on Darwin.

Loading…
Cancel
Save