added pull request from als plus an NPE fix

pull/1/head
Michael Christen 13 years ago
commit 9cd469e6d6

@ -1,17 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<classpath> <classpath>
<classpathentry excluding="env/|htdocsdefault/|proxymsg/|yacy/|env/|yacy/user/|yacy/user/|yacy/ui/|processing/domaingraph/applet/|processing/domaingraph/|api/|api/bookmarks/posts/|api/bookmarks/|api/util/|api/bookmarks/xbel/|api/bookmarks/tags/|api/ymarks/" kind="src" path="htroot"/>
<classpathentry kind="src" path="test"/>
<classpathentry excluding="user/|user/|ui/" kind="src" path="htroot/yacy"/>
<classpathentry kind="src" path="htroot/env"/>
<classpathentry kind="src" path="source"/> <classpathentry kind="src" path="source"/>
<classpathentry kind="src" path="htroot/yacy/ui"/>
<classpathentry excluding="bookmarks/posts/|bookmarks/|util/|bookmarks/xbel/|bookmarks/tags/|ymarks/" kind="src" path="htroot/api"/>
<classpathentry kind="src" path="htroot/api/bookmarks/posts"/>
<classpathentry excluding="posts/|xbel/|tags/" kind="src" path="htroot/api/bookmarks"/>
<classpathentry kind="src" path="htroot/api/bookmarks/xbel"/>
<classpathentry kind="src" path="htroot/api/bookmarks/tags"/>
<classpathentry kind="src" path="htroot/api/ymarks"/>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/> <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER"/>
<classpathentry kind="lib" path="lib/commons-logging-1.1.1.jar"/> <classpathentry kind="lib" path="lib/commons-logging-1.1.1.jar"/>
<classpathentry kind="lib" path="lib/servlet-api.jar"/> <classpathentry kind="lib" path="lib/servlet-api.jar"/>

@ -7,7 +7,7 @@ releaseVersion=1.0
stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz stdReleaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
sourceReleaseFile=yacy_src_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz sourceReleaseFile=yacy_src_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
releaseFileParentDir=yacy releaseFileParentDir=yacy
releaseNr=$Revision: 8134 $ releaseNr=$Revision: 8135 $
privateKeyFile=private.key privateKeyFile=private.key
# defining some file/directory access rights # defining some file/directory access rights

@ -27,6 +27,9 @@ import de.anomic.crawler.retrieval.Response;
public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandler { public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandler {
private static final String EMPTY_STRING = new String();
public final static String SPACE = " "; public final static String SPACE = " ";
public final static String POISON = ""; public final static String POISON = "";
public final static HashSet<String> stopwords = new HashSet<String>(Arrays.asList(".", "!", "?", "nbsp", "uuml", "ouml", "auml", "amp", "quot", "laquo", "raquo", public final static HashSet<String> stopwords = new HashSet<String>(Arrays.asList(".", "!", "?", "nbsp", "uuml", "ouml", "auml", "amp", "quot", "laquo", "raquo",
@ -83,7 +86,9 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
final TreeSet<YMarkTag> topwords = new TreeSet<YMarkTag>(); final TreeSet<YMarkTag> topwords = new TreeSet<YMarkTag>();
StringBuilder token; StringBuilder token;
if(document != null) { if(document == null) {
return EMPTY_STRING;
}
//get words from document //get words from document
final Map<String, Word> words = new Condenser(document, true, true, LibraryProvider.dymLib).words(); final Map<String, Word> words = new Condenser(document, true, true, LibraryProvider.dymLib).words();
@ -95,7 +100,8 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
buffer.append(document.dc_title().toLowerCase()); buffer.append(document.dc_title().toLowerCase());
buffer.append(document.dc_description().toLowerCase()); buffer.append(document.dc_description().toLowerCase());
buffer.append(document.dc_subject(' ').toLowerCase()); buffer.append(document.dc_subject(' ').toLowerCase());
final Enumeration<StringBuilder> tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib); final WordTokenizer tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib);
try {
int score = 0; int score = 0;
// get phrases // get phrases
@ -163,14 +169,16 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
return document.getFileExtension(); return document.getFileExtension();
} }
return clean; return clean;
} finally {
tokens.close();
} }
return new String();
} }
private static TreeMap<String, YMarkTag> getPhrases(final Document document, final int size) { private static TreeMap<String, YMarkTag> getPhrases(final Document document, final int size) {
final TreeMap<String, YMarkTag> phrases = new TreeMap<String, YMarkTag>(); final TreeMap<String, YMarkTag> phrases = new TreeMap<String, YMarkTag>();
final StringBuilder phrase = new StringBuilder(128); final StringBuilder phrase = new StringBuilder(128);
final Enumeration<StringBuilder> tokens = new WordTokenizer(document.getText(), LibraryProvider.dymLib); final WordTokenizer tokens = new WordTokenizer(document.getText(), LibraryProvider.dymLib);
try {
StringBuilder token; StringBuilder token;
int count = 0; int count = 0;
@ -201,6 +209,9 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
} }
return phrases; return phrases;
} finally {
tokens.close();
}
} }
public static String autoTag(final String url, final LoaderDispatcher loader, final int max, final TreeMap<String, YMarkTag> tags) { public static String autoTag(final String url, final LoaderDispatcher loader, final int max, final TreeMap<String, YMarkTag> tags) {

@ -235,8 +235,8 @@ public final class Condenser {
if (text == null) return; if (text == null) return;
String word; String word;
Word wprop; Word wprop;
WordTokenizer wordenum; WordTokenizer wordenum = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(text)), meaningLib);
wordenum = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(text)), meaningLib); try {
int pip = 0; int pip = 0;
while (wordenum.hasMoreElements()) { while (wordenum.hasMoreElements()) {
word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH); word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH);
@ -251,6 +251,9 @@ public final class Condenser {
this.RESULT_NUMB_WORDS++; this.RESULT_NUMB_WORDS++;
this.RESULT_DIFF_WORDS++; this.RESULT_DIFF_WORDS++;
} }
} finally {
wordenum.close();
}
} }
public Condenser(final InputStream text, final WordCache meaningLib) { public Condenser(final InputStream text, final WordCache meaningLib) {
@ -296,6 +299,7 @@ public final class Condenser {
// read source // read source
final WordTokenizer wordenum = new WordTokenizer(is, meaningLib); final WordTokenizer wordenum = new WordTokenizer(is, meaningLib);
try {
while (wordenum.hasMoreElements()) { while (wordenum.hasMoreElements()) {
word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH); word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH);
if (this.languageIdentificator != null) this.languageIdentificator.add(word); if (this.languageIdentificator != null) this.languageIdentificator.add(word);
@ -336,6 +340,9 @@ public final class Condenser {
wordInSentenceCounter++; wordInSentenceCounter++;
} }
} }
} finally {
wordenum.close();
}
if (pseudostemming) { if (pseudostemming) {
Map.Entry<String, Word> entry; Map.Entry<String, Word> entry;

@ -141,4 +141,12 @@ public class SentenceReader implements Iterator<StringBuilder> {
public void remove() { public void remove() {
throw new UnsupportedOperationException(); throw new UnsupportedOperationException();
} }
/**
 * Closes {@code raf}.
 * An {@link IOException} raised by that call is caught and discarded,
 * so this method never propagates one to the caller.
 * NOTE(review): the declaration of {@code raf} is not visible here —
 * confirm which resource it refers to.
 */
public void close() {
try {
raf.close();
} catch(IOException ioe) {
// Ignore IO Exceptions
// (best-effort close: the failure is neither logged nor rethrown)
}
}
} }

@ -280,14 +280,21 @@ public final class TextParser {
final HashMap<Parser, Parser.Failure> failedParser = new HashMap<Parser, Parser.Failure>(); final HashMap<Parser, Parser.Failure> failedParser = new HashMap<Parser, Parser.Failure>();
if (MemoryControl.request(sourceArray.length * 6, false)) { if (MemoryControl.request(sourceArray.length * 6, false)) {
for (final Parser parser: parsers) { for (final Parser parser: parsers) {
ByteArrayInputStream bis = new ByteArrayInputStream(sourceArray);
try { try {
docs = parser.parse(location, mimeType, documentCharset, new ByteArrayInputStream(sourceArray)); docs = parser.parse(location, mimeType, documentCharset, bis);
} catch (final Parser.Failure e) { } catch (final Parser.Failure e) {
failedParser.put(parser, e); failedParser.put(parser, e);
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e); //log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
} catch (final Exception e) { } catch (final Exception e) {
failedParser.put(parser, new Parser.Failure(e.getMessage(), location)); failedParser.put(parser, new Parser.Failure(e.getMessage(), location));
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e); //log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
} finally {
try {
bis.close();
} catch(IOException ioe) {
// Ignore.
}
} }
if (docs != null) break; if (docs != null) break;
} }

@ -80,6 +80,10 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
return r; return r;
} }
/**
 * Closes this tokenizer by forwarding the call to {@code e.close()}.
 * NOTE(review): the declaration of {@code e} is not visible here —
 * confirm what its {@code close()} releases and that {@code e} cannot be null.
 */
public void close() {
e.close();
}
private static class unsievedWordsEnum implements Enumeration<StringBuilder> { private static class unsievedWordsEnum implements Enumeration<StringBuilder> {
// returns an enumeration of StringBuilder Objects // returns an enumeration of StringBuilder Objects
private StringBuilder buffer = null; private StringBuilder buffer = null;
@ -145,6 +149,9 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
return r; return r;
} }
/**
 * Forwards the close request to {@code e.close()}.
 * NOTE(review): presumably {@code e} is the wrapped source this enumeration
 * reads from — its declaration is not visible here, verify before relying on it.
 */
public void close() {
e.close();
}
} }
public static StringBuilder trim(final StringBuilder sb) { public static StringBuilder trim(final StringBuilder sb) {
@ -172,7 +179,8 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
*/ */
public static SortedMap<byte[], Integer> hashSentence(final String sentence, final WordCache meaningLib) { public static SortedMap<byte[], Integer> hashSentence(final String sentence, final WordCache meaningLib) {
final SortedMap<byte[], Integer> map = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder); final SortedMap<byte[], Integer> map = new TreeMap<byte[], Integer>(Base64Order.enhancedCoder);
final Enumeration<StringBuilder> words = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), meaningLib); final WordTokenizer words = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(sentence)), meaningLib);
try {
int pos = 0; int pos = 0;
StringBuilder word; StringBuilder word;
byte[] hash; byte[] hash;
@ -190,5 +198,8 @@ public class WordTokenizer implements Enumeration<StringBuilder> {
pos += word.length() + 1; pos += word.length() + 1;
} }
return map; return map;
} finally {
words.close();
}
} }
} }

@ -58,6 +58,7 @@ import net.yacy.kelondro.util.MemoryControl;
public class ContentScraper extends AbstractScraper implements Scraper { public class ContentScraper extends AbstractScraper implements Scraper {
private static final String EMPTY_STRING = new String();
private final char degree = '\u00B0'; private final char degree = '\u00B0';
private final char[] minuteCharsHTML = "&#039;".toCharArray(); private final char[] minuteCharsHTML = "&#039;".toCharArray();
@ -159,7 +160,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.iframes = new HashSet<MultiProtocolURI>(); this.iframes = new HashSet<MultiProtocolURI>();
this.metas = new HashMap<String, String>(); this.metas = new HashMap<String, String>();
this.script = new HashSet<MultiProtocolURI>(); this.script = new HashSet<MultiProtocolURI>();
this.title = ""; this.title = EMPTY_STRING;
this.headlines = new ArrayList[6]; this.headlines = new ArrayList[6];
for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList<String>(); for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList<String>();
this.bold = new ClusteredScoreMap<String>(); this.bold = new ClusteredScoreMap<String>();
@ -318,14 +319,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public void scrapeTag0(final String tagname, final Properties tagopts) { public void scrapeTag0(final String tagname, final Properties tagopts) {
if (tagname.equalsIgnoreCase("img")) { if (tagname.equalsIgnoreCase("img")) {
final String src = tagopts.getProperty("src", ""); final String src = tagopts.getProperty("src", EMPTY_STRING);
try { try {
final int width = Integer.parseInt(tagopts.getProperty("width", "-1")); final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
final int height = Integer.parseInt(tagopts.getProperty("height", "-1")); final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
if (src.length() > 0) { if (src.length() > 0) {
final MultiProtocolURI url = absolutePath(src); final MultiProtocolURI url = absolutePath(src);
if (url != null) { if (url != null) {
final ImageEntry ie = new ImageEntry(url, tagopts.getProperty("alt", ""), width, height, -1); final ImageEntry ie = new ImageEntry(url, tagopts.getProperty("alt", EMPTY_STRING), width, height, -1);
addImage(this.images, ie); addImage(this.images, ie);
} }
} }
@ -333,47 +334,47 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.evaluationScores.match(Element.imgpath, src); this.evaluationScores.match(Element.imgpath, src);
} else if(tagname.equalsIgnoreCase("base")) { } else if(tagname.equalsIgnoreCase("base")) {
try { try {
this.root = new MultiProtocolURI(tagopts.getProperty("href", "")); this.root = new MultiProtocolURI(tagopts.getProperty("href", EMPTY_STRING));
} catch (final MalformedURLException e) {} } catch (final MalformedURLException e) {}
} else if (tagname.equalsIgnoreCase("frame")) { } else if (tagname.equalsIgnoreCase("frame")) {
final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", "")); final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", EMPTY_STRING));
mergeAnchors(src, tagopts /* with property "name" */); mergeAnchors(src, tagopts /* with property "name" */);
this.frames.add(src); this.frames.add(src);
this.evaluationScores.match(Element.framepath, src.toNormalform(true, false)); this.evaluationScores.match(Element.framepath, src.toNormalform(true, false));
} else if (tagname.equalsIgnoreCase("body")) { } else if (tagname.equalsIgnoreCase("body")) {
final String c = tagopts.getProperty("class", ""); final String c = tagopts.getProperty("class", EMPTY_STRING);
this.evaluationScores.match(Element.bodyclass, c); this.evaluationScores.match(Element.bodyclass, c);
} else if (tagname.equalsIgnoreCase("div")) { } else if (tagname.equalsIgnoreCase("div")) {
final String id = tagopts.getProperty("id", ""); final String id = tagopts.getProperty("id", EMPTY_STRING);
this.evaluationScores.match(Element.divid, id); this.evaluationScores.match(Element.divid, id);
} else if (tagname.equalsIgnoreCase("meta")) { } else if (tagname.equalsIgnoreCase("meta")) {
String name = tagopts.getProperty("name", ""); String name = tagopts.getProperty("name", EMPTY_STRING);
final String content = tagopts.getProperty("content",""); final String content = tagopts.getProperty("content", EMPTY_STRING);
if (name.length() > 0) { if (name.length() > 0) {
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content)); this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
if (name.equals("generator")) { if (name.equals("generator")) {
this.evaluationScores.match(Element.metagenerator, content); this.evaluationScores.match(Element.metagenerator, content);
} }
} else { } else {
name = tagopts.getProperty("http-equiv", ""); name = tagopts.getProperty("http-equiv", EMPTY_STRING);
if (name.length() > 0) { if (name.length() > 0) {
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content)); this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
} }
} }
} else if (tagname.equalsIgnoreCase("area")) { } else if (tagname.equalsIgnoreCase("area")) {
final String areatitle = cleanLine(tagopts.getProperty("title","")); final String areatitle = cleanLine(tagopts.getProperty("title",EMPTY_STRING));
//String alt = tagopts.getProperty("alt",""); //String alt = tagopts.getProperty("alt",EMPTY_STRING);
final String href = tagopts.getProperty("href", ""); final String href = tagopts.getProperty("href", EMPTY_STRING);
tagopts.put("nme", areatitle); tagopts.put("nme", areatitle);
if (href.length() > 0) mergeAnchors(absolutePath(href), tagopts); if (href.length() > 0) mergeAnchors(absolutePath(href), tagopts);
} else if (tagname.equalsIgnoreCase("link")) { } else if (tagname.equalsIgnoreCase("link")) {
final String href = tagopts.getProperty("href", ""); final String href = tagopts.getProperty("href", EMPTY_STRING);
final MultiProtocolURI newLink = absolutePath(href); final MultiProtocolURI newLink = absolutePath(href);
if (newLink != null) { if (newLink != null) {
final String rel = tagopts.getProperty("rel", ""); final String rel = tagopts.getProperty("rel", EMPTY_STRING);
final String linktitle = tagopts.getProperty("title", ""); final String linktitle = tagopts.getProperty("title", EMPTY_STRING);
final String type = tagopts.getProperty("type", ""); final String type = tagopts.getProperty("type", EMPTY_STRING);
if (rel.equalsIgnoreCase("shortcut icon")) { if (rel.equalsIgnoreCase("shortcut icon")) {
final ImageEntry ie = new ImageEntry(newLink, linktitle, -1, -1, -1); final ImageEntry ie = new ImageEntry(newLink, linktitle, -1, -1, -1);
@ -394,11 +395,11 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} }
} }
} else if(tagname.equalsIgnoreCase("embed")) { } else if(tagname.equalsIgnoreCase("embed")) {
mergeAnchors(absolutePath(tagopts.getProperty("src", "")), tagopts /* with property "name" */); mergeAnchors(absolutePath(tagopts.getProperty("src", EMPTY_STRING)), tagopts /* with property "name" */);
} else if(tagname.equalsIgnoreCase("param")) { } else if(tagname.equalsIgnoreCase("param")) {
final String name = tagopts.getProperty("name", ""); final String name = tagopts.getProperty("name", EMPTY_STRING);
if (name.equalsIgnoreCase("movie")) { if (name.equalsIgnoreCase("movie")) {
mergeAnchors(absolutePath(tagopts.getProperty("value", "")), tagopts /* with property "name" */); mergeAnchors(absolutePath(tagopts.getProperty("value", EMPTY_STRING)), tagopts /* with property "name" */);
} }
} }
@ -409,12 +410,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public void scrapeTag1(final String tagname, final Properties tagopts, final char[] text) { public void scrapeTag1(final String tagname, final Properties tagopts, final char[] text) {
// System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + UTF8.String(text)); // System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + UTF8.String(text));
if (tagname.equalsIgnoreCase("a") && text.length < 2048) { if (tagname.equalsIgnoreCase("a") && text.length < 2048) {
final String href = tagopts.getProperty("href", ""); final String href = tagopts.getProperty("href", EMPTY_STRING);
MultiProtocolURI url; MultiProtocolURI url;
if ((href.length() > 0) && ((url = absolutePath(href)) != null)) { if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
final String f = url.getFile(); final String f = url.getFile();
final int p = f.lastIndexOf('.'); final int p = f.lastIndexOf('.');
final String type = (p < 0) ? "" : f.substring(p + 1); final String type = (p < 0) ? EMPTY_STRING : f.substring(p + 1);
if (type.equals("png") || type.equals("gif") || type.equals("jpg") || type.equals("jpeg") || type.equals("tiff") || type.equals("tif")) { if (type.equals("png") || type.equals("gif") || type.equals("jpg") || type.equals("jpeg") || type.equals("tiff") || type.equals("tif")) {
// special handling of such urls: put them to the image urls // special handling of such urls: put them to the image urls
final ImageEntry ie = new ImageEntry(url, recursiveParse(text), -1, -1, -1); final ImageEntry ie = new ImageEntry(url, recursiveParse(text), -1, -1, -1);
@ -461,12 +462,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
h = recursiveParse(text); h = recursiveParse(text);
if (h.length() > 0) this.li.add(h); if (h.length() > 0) this.li.add(h);
} else if (tagname.equalsIgnoreCase("iframe")) { } else if (tagname.equalsIgnoreCase("iframe")) {
final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", "")); final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", EMPTY_STRING));
mergeAnchors(src, tagopts /* with property "name" */); mergeAnchors(src, tagopts /* with property "name" */);
this.iframes.add(src); this.iframes.add(src);
this.evaluationScores.match(Element.iframepath, src.toNormalform(true, false)); this.evaluationScores.match(Element.iframepath, src.toNormalform(true, false));
} else if (tagname.equalsIgnoreCase("script")) { } else if (tagname.equalsIgnoreCase("script")) {
final String src = tagopts.getProperty("src", ""); final String src = tagopts.getProperty("src", EMPTY_STRING);
if (src.length() > 0) { if (src.length() > 0) {
this.script.add(absolutePath(src)); this.script.add(absolutePath(src));
this.evaluationScores.match(Element.scriptpath, src); this.evaluationScores.match(Element.scriptpath, src);
@ -507,7 +508,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} }
private final static String cleanLine(final String s) { private final static String cleanLine(final String s) {
if (!MemoryControl.request(s.length() * 2, false)) return ""; if (!MemoryControl.request(s.length() * 2, false)) return EMPTY_STRING;
final StringBuilder sb = new StringBuilder(s.length()); final StringBuilder sb = new StringBuilder(s.length());
char l = ' '; char l = ' ';
char c; char c;
@ -683,27 +684,27 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public String getDescription() { public String getDescription() {
String s = this.metas.get("description"); String s = this.metas.get("description");
if (s == null) s = this.metas.get("dc.description"); if (s == null) s = this.metas.get("dc.description");
if (s == null) return ""; if (s == null) return EMPTY_STRING;
return s; return s;
} }
public String getContentType() { public String getContentType() {
final String s = this.metas.get("content-type"); final String s = this.metas.get("content-type");
if (s == null) return ""; if (s == null) return EMPTY_STRING;
return s; return s;
} }
public String getAuthor() { public String getAuthor() {
String s = this.metas.get("author"); String s = this.metas.get("author");
if (s == null) s = this.metas.get("dc.creator"); if (s == null) s = this.metas.get("dc.creator");
if (s == null) return ""; if (s == null) return EMPTY_STRING;
return s; return s;
} }
public String getPublisher() { public String getPublisher() {
String s = this.metas.get("copyright"); String s = this.metas.get("copyright");
if (s == null) s = this.metas.get("dc.publisher"); if (s == null) s = this.metas.get("dc.publisher");
if (s == null) return ""; if (s == null) return EMPTY_STRING;
return s; return s;
} }
@ -732,7 +733,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public String[] getKeywords() { public String[] getKeywords() {
String s = this.metas.get("keywords"); String s = this.metas.get("keywords");
if (s == null) s = this.metas.get("dc.description"); if (s == null) s = this.metas.get("dc.description");
if (s == null) s = ""; if (s == null) s = EMPTY_STRING;
if (s.length() == 0) { if (s.length() == 0) {
return MultiProtocolURI.splitpattern.split(getTitle().toLowerCase()); return MultiProtocolURI.splitpattern.split(getTitle().toLowerCase());
} }
@ -756,13 +757,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public String getRefreshPath() { public String getRefreshPath() {
String s = this.metas.get("refresh"); String s = this.metas.get("refresh");
if (s == null) return ""; if (s == null) return EMPTY_STRING;
final int pos = s.indexOf(';'); final int pos = s.indexOf(';');
if (pos < 0) return ""; if (pos < 0) return EMPTY_STRING;
s = s.substring(pos + 1); s = s.substring(pos + 1);
if (s.toLowerCase().startsWith("url=")) return s.substring(4).trim(); if (s.toLowerCase().startsWith("url=")) return s.substring(4).trim();
return ""; return EMPTY_STRING;
} }
// parse location // parse location

@ -81,11 +81,22 @@ public final class TransformerWriter extends Writer {
final Scraper scraper, final Scraper scraper,
final Transformer transformer, final Transformer transformer,
final boolean passbyIfBinarySuspect final boolean passbyIfBinarySuspect
) {
this(outStream, charSet, scraper, transformer, passbyIfBinarySuspect, 1024);
}
public TransformerWriter(
final OutputStream outStream,
final Charset charSet,
final Scraper scraper,
final Transformer transformer,
final boolean passbyIfBinarySuspect,
final int initialBufferSize
) { ) {
this.outStream = outStream; this.outStream = outStream;
this.scraper = scraper; this.scraper = scraper;
this.transformer = transformer; this.transformer = transformer;
this.buffer = new CharBuffer(1024); this.buffer = new CharBuffer(initialBufferSize);
this.filterTag = null; this.filterTag = null;
this.filterOpts = null; this.filterOpts = null;
this.filterCont = null; this.filterCont = null;
@ -540,6 +551,7 @@ public final class TransformerWriter extends Writer {
final char[] filtered = filterSentence(this.buffer.getChars(), quotechar); final char[] filtered = filterSentence(this.buffer.getChars(), quotechar);
if (this.out != null) this.out.write(filtered); if (this.out != null) this.out.write(filtered);
} }
this.buffer.close();
this.buffer = null; this.buffer = null;
} }
final char[] finalized = filterFinalize(quotechar); final char[] finalized = filterFinalize(quotechar);
@ -550,6 +562,7 @@ public final class TransformerWriter extends Writer {
} }
this.filterTag = null; this.filterTag = null;
this.filterOpts = null; this.filterOpts = null;
if (this.filterCont != null) this.filterCont.close();
this.filterCont = null; this.filterCont = null;
// if (scraper != null) {scraper.close(); scraper = null;} // if (scraper != null) {scraper.close(); scraper = null;}
// if (transformer != null) {transformer.close(); transformer = null;} // if (transformer != null) {transformer.close(); transformer = null;}

@ -196,7 +196,7 @@ public class htmlParser extends AbstractParser implements Parser {
// parsing the content // parsing the content
final ContentScraper scraper = new ContentScraper(location); final ContentScraper scraper = new ContentScraper(location);
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false); final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, sourceStream.available());
try { try {
FileUtils.copy(sourceStream, writer, c); FileUtils.copy(sourceStream, writer, c);
} catch (final IOException e) { } catch (final IOException e) {

@ -114,17 +114,20 @@ public class odtParser extends AbstractParser implements Parser {
if (entryName.equals("content.xml")) { if (entryName.equals("content.xml")) {
// create a writer for output // create a writer for output
writer = new CharBuffer(); writer = new CharBuffer((int)zipEntry.getSize());
try {
// extract data // extract data
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry); final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
try {
final SAXParser saxParser = saxParserFactory.newSAXParser(); final SAXParser saxParser = saxParserFactory.newSAXParser();
saxParser.parse(zipFileEntryStream, new ODContentHandler(writer)); saxParser.parse(zipFileEntryStream, new ODContentHandler(writer));
} finally {
// close readers and writers // close readers and writers
zipFileEntryStream.close(); zipFileEntryStream.close();
}
} finally {
writer.close(); writer.close();
}
} else if (entryName.equals("meta.xml")) { } else if (entryName.equals("meta.xml")) {
// meta.xml contains metadata about the document // meta.xml contains metadata about the document
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry); final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);

@ -100,17 +100,21 @@ public class ooxmlParser extends AbstractParser implements Parser {
|| entryName.startsWith("xl/worksheets/sheet")) { || entryName.startsWith("xl/worksheets/sheet")) {
// create a writer for output // create a writer for output
writer = new CharBuffer(); writer = new CharBuffer((int)zipEntry.getSize());
try {
// extract data // extract data
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry); final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);
try {
final SAXParser saxParser = saxParserFactory.newSAXParser(); final SAXParser saxParser = saxParserFactory.newSAXParser();
saxParser.parse(zipFileEntryStream, new ODContentHandler(writer)); saxParser.parse(zipFileEntryStream, new ODContentHandler(writer));
// close readers and writers // close readers and writers
} finally {
zipFileEntryStream.close(); zipFileEntryStream.close();
}
} finally {
writer.close(); writer.close();
}
} else if (entryName.equals("docProps/core.xml")) { } else if (entryName.equals("docProps/core.xml")) {
// meta.xml contains metadata about the document // meta.xml contains metadata about the document
final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry); final InputStream zipFileEntryStream = zipFile.getInputStream(zipEntry);

@ -898,8 +898,7 @@ public class ArrayStack implements BLOB {
if (!i1.hasNext()) { if (!i1.hasNext()) {
if (i2.hasNext()) { if (i2.hasNext()) {
HeapWriter.delete(f1); HeapWriter.delete(f1);
if (f2.renameTo(newFile)) if (f2.renameTo(newFile)) return newFile;
return newFile;
return f2; return f2;
} }
HeapWriter.delete(f1); HeapWriter.delete(f1);
@ -907,8 +906,7 @@ public class ArrayStack implements BLOB {
return null; return null;
} else if (!i2.hasNext()) { } else if (!i2.hasNext()) {
HeapWriter.delete(f2); HeapWriter.delete(f2);
if (f1.renameTo(newFile)) if (f1.renameTo(newFile)) return newFile;
return newFile;
return f1; return f1;
} }
assert i1.hasNext(); assert i1.hasNext();

@ -48,6 +48,8 @@ import net.yacy.kelondro.util.kelondroException;
public class RowCollection implements Sortable<Row.Entry>, Iterable<Row.Entry>, Cloneable { public class RowCollection implements Sortable<Row.Entry>, Iterable<Row.Entry>, Cloneable {
private static final byte[] EMPTY_CACHE = new byte[0];
public static final long growfactorLarge100 = 140L; public static final long growfactorLarge100 = 140L;
public static final long growfactorSmall100 = 120L; public static final long growfactorSmall100 = 120L;
private static final int isortlimit = 20; private static final int isortlimit = 20;
@ -77,7 +79,7 @@ public class RowCollection implements Sortable<Row.Entry>, Iterable<Row.Entry>,
this.rowdef = rowdef; this.rowdef = rowdef;
this.sortBound = 0; this.sortBound = 0;
this.lastTimeWrote = System.currentTimeMillis(); this.lastTimeWrote = System.currentTimeMillis();
this.chunkcache = new byte[0]; this.chunkcache = EMPTY_CACHE;
this.chunkcount = 0; this.chunkcount = 0;
} }

@ -122,9 +122,9 @@ public final class CharBuffer extends Writer {
return length; return length;
} }
private void grow() { private void grow(int minSize) {
int newsize = buffer.length * 2 + 1; int newsize = buffer.length + 1024;
if (newsize < 32) newsize = 32; if (newsize < minSize) newsize = minSize+1;
char[] tmp = new char[newsize]; char[] tmp = new char[newsize];
System.arraycopy(buffer, offset, tmp, 0, length); System.arraycopy(buffer, offset, tmp, 0, length);
buffer = tmp; buffer = tmp;
@ -136,7 +136,7 @@ public final class CharBuffer extends Writer {
} }
public void write(final char b) { public void write(final char b) {
if (offset + length + 1 > buffer.length) grow(); if (offset + length + 1 > buffer.length) grow(offset + length + 1);
buffer[offset + length++] = b; buffer[offset + length++] = b;
} }
@ -145,7 +145,7 @@ public final class CharBuffer extends Writer {
} }
public void write(final char[] bb, final int of, final int le) { public void write(final char[] bb, final int of, final int le) {
while (offset + length + le > buffer.length) grow(); if (offset + length + le > buffer.length) grow(offset + length + le);
System.arraycopy(bb, of, buffer, offset + length, le); System.arraycopy(bb, of, buffer, offset + length, le);
length += le; length += le;
} }
@ -476,7 +476,7 @@ public final class CharBuffer extends Writer {
} }
public void close() throws IOException { public void close() throws IOException {
// TODO Auto-generated method stub buffer = null; // assist with garbage collection
} }
public void flush() throws IOException { public void flush() throws IOException {

Loading…
Cancel
Save