Merge branch 'master' of https://github.com/yacy/yacy_search_server.git

9 years ago · 094aed8664
parent c7402a2f89 7c81160f45
commit 094aed8664
9 changed files with 71 additions and 18 deletions
--- a/htroot/BlacklistImpExp_p.html
+++ b/htroot/BlacklistImpExp_p.html
@ -80,7 +80,7 @@
            </fieldset>
        </form>

-        <form action="yacy/list.html" method="get" accept-charset="UTF-8">
+        <form action="api/blacklists_p.txt" method="get" accept-charset="UTF-8">
            <fieldset>
                <legend>plain text file:</legend>
                Here you can export a blacklist as a regular text file with one blacklist entry per line.
--- a/htroot/api/blacklists_p.txt
+++ b/htroot/api/blacklists_p.txt
@ -0,0 +1,4 @@
+#{lists}#
+#{items}##[item]#
+#{/items}#
+#{/lists}#
--- a/htroot/yacy/list.java
+++ b/htroot/yacy/list.java
@ -51,6 +51,7 @@ public final class list {

        // return variable that accumulates replacements
        final serverObjects prop = new serverObjects();
+        prop.put("list", ""); // init an empty return value (error case)
        if ((post == null) || (env == null)) return prop;
        if (!Protocol.authentifyRequest(post, env)) return prop;

@ -66,7 +67,7 @@ public final class list {

        if ((sb.isRobinsonMode()) && (!sb.isInMyCluster(otherPeerName))) {
            // if we are a robinson cluster, answer only if this client is known by our network definition
-            return null;
+            return prop;
        }

        if (col.equals("black")) {
@ -85,8 +86,6 @@ public final class list {
            }

            prop.put("list",out.toString());
-        } else {
-            prop.put("list","");
        }

        return prop;
--- a/source/net/yacy/document/Condenser.java
+++ b/source/net/yacy/document/Condenser.java
@ -224,18 +224,19 @@ public final class Condenser extends Tokenizer {
        try {
 	        int pip = 0;
 	        while (wordenum.hasMoreElements()) {
-	            word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH);
-	            if (useForLanguageIdentification) this.languageIdentificator.add(word);
-	            if (word.length() < 2) continue;
+	            word = wordenum.nextElement().toString();
+	            if (useForLanguageIdentification) this.languageIdentificator.add(word); // langdetect is case sensitive
+                    if (word.length() < 2) continue;
+                    word = word.toLowerCase(Locale.ENGLISH);
 	            wprop = this.words.get(word);
 	            if (wprop == null) wprop = new Word(0, pip, phrase);
 	            if (wprop.flags == null) wprop.flags = flagstemplate.clone();
 	            wprop.flags.set(flagpos, true);
-	            this.words.put(word.toLowerCase(), wprop);
+	            this.words.put(word, wprop);
 	            pip++;
 	            this.RESULT_NUMB_WORDS++;
 	            //this.RESULT_DIFF_WORDS++;
-	        }
+                }
        } finally {
        	wordenum.close();
        	wordenum = null;
--- a/source/net/yacy/document/DateDetection.java
+++ b/source/net/yacy/document/DateDetection.java
@ -127,7 +127,7 @@ public class DateDetection {
    private final static Date TODAY = new Date();
    private final static int CURRENT_YEAR  = Integer.parseInt(CONFORM.format(TODAY).substring(0, 4)); // we need that to parse dates without given years, see the ShortStyle class

-    private final static String BODNCG = "(?:\\b|^)"; // begin of date non-capturing group
+    private final static String BODNCG = "(?:\\s|^)"; // begin of date non-capturing group
    private final static String EODNCG = "(?:[).:;! ]|$)"; // end of date non-capturing group
    private final static String SEPARATORNCG = "(?:/|-| - |\\.\\s|,\\s|\\.|,|\\s)"; // separator non-capturing group
    private final static String DAYCAPTURE = "(\\d{1,2})";
--- a/source/net/yacy/document/Tokenizer.java
+++ b/source/net/yacy/document/Tokenizer.java
@ -56,7 +56,7 @@ public class Tokenizer {
    public  static final int flag_cat_hasapp        = 23; // the page refers to (at least one) application file

    //private Properties analysis;
-    protected final Map<String, Word> words; // a string (the words) to (indexWord) - relation
+    protected final Map<String, Word> words; // a string (the words) to (indexWord) - relation (key: words are lowercase)
    private final Set<String> synonyms; // a set of synonyms to the words
    protected final Map<String, Set<Tagging.Metatag>> tags = new HashMap<String, Set<Tagging.Metatag>>(); // a set of tags, discovered from Autotagging
    
@ -68,7 +68,6 @@ public class Tokenizer {
        this.words = new TreeMap<String, Word>(NaturalOrder.naturalComparator);
        this.synonyms = new LinkedHashSet<String>();
        assert text != null;
-        final Set<String> currsentwords = new HashSet<String>();
        String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1];
        for (int i = 0; i < wordcache.length; i++) wordcache[i] = "";
        String k;
@ -89,9 +88,9 @@ public class Tokenizer {
                // handle punktuation (start new sentence)
                if (word.length() == 1 && SentenceReader.punctuation(word.charAt(0))) {
                    // store sentence
-                    currsentwords.clear();
+                    if (wordInSentenceCounter > 1) // repeated punctuation such as "....." contains no word, so don't count it as a sentence
+                        allsentencecounter++;
                    wordInSentenceCounter = 1;
-                    allsentencecounter++;
                    continue;
                }
                if (word.length() < wordminsize) continue;
@ -160,7 +159,6 @@ public class Tokenizer {

                // store word
                allwordcounter++;
-                currsentwords.add(word);
                Word wsp = this.words.get(word);
                if (wsp != null) {
                    // word already exists
@ -169,7 +167,7 @@ public class Tokenizer {
                    // word does not yet exist, create new word entry
                    wsp = new Word(allwordcounter, wordInSentenceCounter, allsentencecounter + 100); // nomal sentence start at 100 !
                    wsp.flags = this.RESULT_FLAGS.clone();
-                    this.words.put(word.toLowerCase(), wsp);
+                    this.words.put(word, wsp);
                }
                // we now have the unique handle of the word, put it into the sentence:
                wordInSentenceCounter++;
@ -214,9 +212,12 @@ public class Tokenizer {
        // store result
        this.RESULT_NUMB_WORDS = allwordcounter;
        // if text doesn't end with punktuation but has words after last found sentence, inc sentence count for trailing text.
-        this.RESULT_NUMB_SENTENCES = allsentencecounter + (currsentwords.size() > 0 ? 1 : 0);
+        this.RESULT_NUMB_SENTENCES = allsentencecounter + (wordInSentenceCounter > 1 ? 1 : 0);
    }
-    
+
+    /**
+     * @return the words as word/indexWord relation map. All words are lowercase.
+     */
    public Map<String, Word> words() {
        // returns the words as word/indexWord relation map
        return this.words;
--- a/source/net/yacy/document/language/Identificator.java
+++ b/source/net/yacy/document/language/Identificator.java
@ -50,6 +50,11 @@ public final class Identificator {
        }
    }

+    /**
+     * Append a word to the text to be analyzed.
+     * Analysis takes letter case into account (this means word should not be upper- or lower cased)
+     * @param word
+     */
    public void add(final String word) {
        if (word == null) return;
        this.detector.append(" " + word); // detector internally caches text up to maxtextlen = default = 10000 chars
--- a/test/java/net/yacy/document/DateDetectionTest.java
+++ b/test/java/net/yacy/document/DateDetectionTest.java
@ -28,6 +28,9 @@ public class DateDetectionTest {
        testtext.add("1.1.2016");
        testtext.add("1. Januar 2016");
        testtext.add("2016, January 1.");
+
+        testtext.add("beginning text 1.1.2016");
+        testtext.add("line break\n1.1.2016");
        for (String text : testtext) {
            Date d = DateDetection.parseLine(text, 0);

@ -82,4 +85,23 @@ public class DateDetectionTest {
        }
    }

+    /**
+     * Negative test of the parseLine method of class DateDetection:
+     * every input here must NOT be recognized as a date (parseLine returns null).
+     */
+    @Test
+    public void testParseLineNoDate() {
+
+        // test input representations (insertion-ordered, explicitly typed to avoid a raw type)
+        Set<String> testtext = new LinkedHashSet<String>();
+        testtext.add("3.1.2.0102"); // example of a program version string
+        // testtext.add("3.1.20.0102"); // date end-capture not working (on modification conflict with YMD parser)
+        testtext.add("v3.1.21");
+        testtext.add("v3.1.22.");
+
+        for (String text : testtext) {
+            Date d = DateDetection.parseLine(text, 0);
+            assertNull("not a date: " + text, d);
+        }
+    }
 }
--- a/test/java/net/yacy/document/TokenizerTest.java
+++ b/test/java/net/yacy/document/TokenizerTest.java
@ -2,7 +2,9 @@
 package net.yacy.document;

 import java.net.MalformedURLException;
+import java.util.HashSet;
 import java.util.Map;
+import java.util.Set;
 import net.yacy.cora.document.WordCache;
 import net.yacy.kelondro.data.word.Word;
 import org.junit.Test;
@ -36,4 +38,23 @@ public class TokenizerTest {
        assertEquals("occurence of 'words' ", 2, w.occurrences());
    }

+    /**
+     * Test of RESULT_NUMB_SENTENCES, of class Tokenizer: every sample text must be counted as 5 sentences.
+     */
+    @Test
+    public void testNumberOfSentences() {
+        Set<String> testText = new HashSet<String>(); // explicitly typed to avoid a raw type
+        // text with 5 sentences
+        testText.add("Sentence One. Sentence Two. Comment on this. This is sentence four! Good By................");
+        testText.add("Sentence One. Sentence two. Sentence 3? Sentence 4! Sentence w/o punktuation at end of text");
+        testText.add("!!! ! ! ! Sentence One. Sentence two. Sentence 3? Sentence 4! Sentence 5 ! ! ! !!!");
+
+        WordCache meaningLib = new WordCache(null);
+        boolean doAutotagging = false;
+        VocabularyScraper scraper = null;
+        for (String text : testText) {
+            Tokenizer t = new Tokenizer(null, text, meaningLib, doAutotagging, scraper);
+            assertEquals("Tokenizer.RESULT_NUMB_SENTENCES", 5, t.RESULT_NUMB_SENTENCES);
+        }
+    }
 }