diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java index 823868128..cf0ddcbfc 100644 --- a/source/de/anomic/plasma/plasmaCondenser.java +++ b/source/de/anomic/plasma/plasmaCondenser.java @@ -39,6 +39,8 @@ // the intact and unchanged copyright notice. // Contributions and changes to the program code must be marked as such. +// compile with javac -sourcepath source source/de/anomic/plasma/plasmaCondenser.java +// execute with java -cp source de.anomic.plasma.plasmaCondenser package de.anomic.plasma; @@ -446,7 +448,6 @@ public class plasmaCondenser { c = r.charAt(i); if (!(((c >= 'a') && (c <= 'z')) || ((c >= '0') && (c <= '9')))) continue loop; // go to next while loop - //if ((c < 'a') || (c > 'z')) continue loop; // go to next while loop } return s; } @@ -480,16 +481,21 @@ public class plasmaCondenser { private Object nextElement0() { String r; + StringBuffer sb; + char c; while (s.length() == 0) { if (e.hasMoreElements()) { - r = ((String) e.nextElement()).trim(); - s = ""; + r = (String) e.nextElement(); + if (r == null) return null; + r = r.trim(); + sb = new StringBuffer(r.length() * 2); for (int i = 0; i < r.length(); i++) { - if (invisible(r.charAt(i))) s = s + " "; - else if (punctuation(r.charAt(i))) s = s + " " + r.charAt(i) + " "; - else s = s + r.charAt(i); + c = r.charAt(i); + if (invisible(c)) sb = sb.append(' '); + else if (punctuation(c)) sb = sb.append(' ').append(c).append(' '); + else sb = sb.append(c); } - s = s.trim(); + s = sb.toString().trim(); //System.out.println("PARSING-LINE '" + r + "'->'" + s + "'"); } else { return null; @@ -636,5 +642,4 @@ public class plasmaCondenser { } } - } diff --git a/source/de/anomic/plasma/plasmaParser.java b/source/de/anomic/plasma/plasmaParser.java index d59a77ee9..ba197bf87 100644 --- a/source/de/anomic/plasma/plasmaParser.java +++ b/source/de/anomic/plasma/plasmaParser.java @@ -39,6 +39,8 @@ // the intact and unchanged copyright notice. // Contributions and changes to the program code must be marked as such. +// compile: javac -classpath lib/commons-collections.jar:lib/commons-pool-1.2.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java + package de.anomic.plasma; @@ -307,17 +309,23 @@ public final class plasmaParser { } public static void main(String[] args) { - try { - plasmaParser theParser = new plasmaParser(new File("yacy.parser")); - FileInputStream theInput = new FileInputStream(new File("Y:/public_html/test.pdf")); - ByteArrayOutputStream theOutput = new ByteArrayOutputStream(); + //javac -classpath lib/commons-collections.jar:lib/commons-pool-1.2.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java + //java -cp source:lib/commons-collections.jar:lib/commons-pool-1.2.jar de.anomic.plasma.plasmaParser bug.html bug.out + try { + File in = new File(args[0]); + File out = new File(args[1]); + plasmaParser theParser = new plasmaParser(new File("yacy.parser")); + FileInputStream theInput = new FileInputStream(in); + ByteArrayOutputStream theOutput = new ByteArrayOutputStream(); serverFileUtils.copy(theInput, theOutput); - - theParser.parseSource(new URL("http://brain"),"application/pdf",theOutput.toByteArray()); + plasmaParserDocument document = theParser.parseSource(new URL("http://brain.yacy"), "text/html", theOutput.toByteArray()); + //plasmaParserDocument document = theParser.parseSource(new URL("http://brain.yacy"), "application/pdf", theOutput.toByteArray()); + byte[] theText = document.getText(); + serverFileUtils.write(theText, out); } catch (Exception e) { e.printStackTrace(); } - } + } } diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index 5684f7952..ebbaa9d8b 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -406,7 +406,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi // work off fresh entries from the proxy or from the crawler if (processStack.size() == 0) { - log.logDebug("DEQUEUE: queue is empty"); + //log.logDebug("DEQUEUE: queue is empty"); return false; // nothing to do } @@ -458,7 +458,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi public boolean localCrawlJob() { if (noticeURL.localStackSize() == 0) { - log.logDebug("LocalCrawl: queue is empty"); + //log.logDebug("LocalCrawl: queue is empty"); return false; } if (processStack.size() >= crawlSlots) { @@ -491,7 +491,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi // do nothing if either there are private processes to be done // or there is no global crawl on the stack if (noticeURL.remoteStackSize() == 0) { - log.logDebug("GlobalCrawl: queue is empty"); + //log.logDebug("GlobalCrawl: queue is empty"); return false; } if (processStack.size() > 0) {