From 6811158e1a4b577e7e9fc21837c0b4f87f240439 Mon Sep 17 00:00:00 2001 From: luccioman Date: Sun, 31 Jul 2016 19:24:52 +0200 Subject: [PATCH 01/28] Expose HTTPS default port on docker images --- docker/Dockerfile | 4 ++-- docker/Dockerfile.alpine | 4 ++-- docker/docker-cloud.yml | 1 + 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 35102e4d6..a090c75e8 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -36,8 +36,8 @@ RUN adduser --system --group --no-create-home --disabled-password yacy # Set ownership of yacy install directory to yacy user/group RUN chown yacy:yacy -R /opt/yacy_search_server -# Expose port 8090 -EXPOSE 8090 +# Expose HTTP and HTTPS default ports +EXPOSE 8090 8443 # Set data volume : yacy data and configuration will persist aven after container stop or destruction VOLUME ["/opt/yacy_search_server/DATA"] diff --git a/docker/Dockerfile.alpine b/docker/Dockerfile.alpine index 497dd70ed..eb913af31 100755 --- a/docker/Dockerfile.alpine +++ b/docker/Dockerfile.alpine @@ -68,8 +68,8 @@ RUN addgroup yacy && adduser -S -G yacy -H -D yacy # Set ownership of yacy install directory to yacy user/group RUN chown yacy:yacy -R /opt/yacy_search_server -# Expose port 8090 -EXPOSE 8090 +# Expose HTTP and HTTPS default ports +EXPOSE 8090 8443 # Set data volume : yacy data and configuration will persist aven after container stop or destruction VOLUME ["/opt/yacy_search_server/DATA"] diff --git a/docker/docker-cloud.yml b/docker/docker-cloud.yml index 24a09a69b..97e965bd7 100644 --- a/docker/docker-cloud.yml +++ b/docker/docker-cloud.yml @@ -2,5 +2,6 @@ yacy: image: 'luccioman/yacy:latest' ports: - '8090:8090' + - '8443:8443' restart: on-failure autoredeploy: true \ No newline at end of file From 16dfc49bfd84a44ae4ba3749607e9ac6f3bb7507 Mon Sep 17 00:00:00 2001 From: luccioman Date: Fri, 5 Aug 2016 11:57:38 +0200 Subject: [PATCH 02/28] Enabled HTTPS as default, and added HTPS related documentation --- 
docker/Dockerfile | 7 ++++++ docker/Dockerfile.alpine | 7 ++++++ docker/Readme.md | 49 +++++++++++++++++++++++++++++++++++++--- 3 files changed, 60 insertions(+), 3 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index a090c75e8..401d1f1d0 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -18,6 +18,10 @@ WORKDIR /opt # - Compile with ant # - remove unnecessary and size consuming .git directory # - remove ant and git packages + +# Possible alternative : copy directly your current sources an remove git clone command from the following RUN +# COPY . /opt/yacy_search_server/ + RUN apt-get update && \ apt-get install -yq ant git && \ git clone https://github.com/yacy/yacy_search_server.git && \ @@ -30,6 +34,9 @@ RUN apt-get update && \ # Set initial admin password : "docker" (encoded with custom yacy md5 function net.yacy.cora.order.Digest.encodeMD5Hex()) RUN sed -i "/adminAccountBase64MD5=/c\adminAccountBase64MD5=MD5:e672161ffdce91be4678605f4f4e6786" /opt/yacy_search_server/defaults/yacy.init +# Intially enable HTTPS : this is the most secure option for remote administrator authentication +RUN sed -i "/server.https=false/c\server.https=true" /opt/yacy_search_server/defaults/yacy.init + # Create user and group yacy : this user will be used to run YaCy main process RUN adduser --system --group --no-create-home --disabled-password yacy diff --git a/docker/Dockerfile.alpine b/docker/Dockerfile.alpine index eb913af31..77134f222 100755 --- a/docker/Dockerfile.alpine +++ b/docker/Dockerfile.alpine @@ -51,6 +51,10 @@ WORKDIR /opt # - compile with apache ant # - remove unnecessary and size consuming .git directory # - delete git package and ant binary install + +# Possible alternative : copy directly your current sources an remove git clone command from the following RUN +# COPY . 
/opt/yacy_search_server/ + RUN apk update && \ apk add --no-cache git && \ git clone https://github.com/yacy/yacy_search_server.git && \ @@ -62,6 +66,9 @@ RUN apk update && \ # Set initial admin password : "docker" (encoded with custom yacy md5 function net.yacy.cora.order.Digest.encodeMD5Hex()) RUN sed -i "/adminAccountBase64MD5=/c\adminAccountBase64MD5=MD5:e672161ffdce91be4678605f4f4e6786" /opt/yacy_search_server/defaults/yacy.init +# Intially enable HTTPS : this is the most secure option for remote administrator authentication +RUN sed -i "/server.https=false/c\server.https=true" /opt/yacy_search_server/defaults/yacy.init + # Create user and group yacy : this user will be used to run YaCy main process RUN addgroup yacy && adduser -S -G yacy -H -D yacy diff --git a/docker/Readme.md b/docker/Readme.md index 7471d05ae..53228ae44 100755 --- a/docker/Readme.md +++ b/docker/Readme.md @@ -20,6 +20,11 @@ Using yacy_search_server/docker/Dockerfile : cd yacy_search_server/docker docker build . +To build the Alpine variant : + + cd yacy_search_server/docker + docker build -f Dockerfile.alpine . + ## Image variants `luccioman/yacy:latest` @@ -51,12 +56,12 @@ You can retrieve the container IP address with `docker inspect`. #### Easier to handle - docker run --name yacy -p 8090:8090 --log-opt max-size=100m --log-opt max-file=2 luccioman/yacy + docker run --name yacy -p 8090:8090 -p 8443:8443 --log-opt max-size=200m --log-opt max-file=2 luccioman/yacy ##### Options detail * --name : allow easier management of your container (without it, docker automatically generate a new name at each startup). -* -p : map host port and container port, allowing web interface access through the usual http://localhost:8090. 
+* -p 8090:8090 -p 8443:8443 : map host ports to YaCy container ports, allowing web interface access through the usual http://localhost:8090 and https://localhost:8443 (you can set a different mapping, for example -p 443:8443 if you prefer to use the default HTTPS port on your host) * --log-opt max-size : limit maximum docker log file size for this container * --log-opt max-file : limit number of docker rotated log files for this container @@ -81,6 +86,44 @@ Note that you can list all docker volumes with : #### As background process docker run -d luccioman/yacy + +### HTTPS support + +This images are default configured with HTTPS enabled, and use a default certificate stored in defaults/freeworldKeystore. You should use your own certificated. In order to do it, you can proceed as follow. + +#### Self-signed certificate + +A self-signed certificate will provide encrypted communications with your YaCy server, but browsers will still complain about an invalid security certificate with the error "SEC_ERROR_UNKNOWN_ISSUER". If it is sufficient for you, you can add permanently add exception to your browser. + +This kind of certificate can be generated and added to your YaCy Docker container with the following : + + keytool -keystore /var/lib/docker/volumes/[your_yacy_volume]/_data/SETTINGS/yacykeystore -genkey -keyalg RSA -alias yacycert + +Then edit YaCy config file. For example with the nano text editor : + + nano /var/lib/docker/volumes/[your_yacy_volume]/_data/SETTINGS/yacy.conf + +And configure the keyStoreXXXX properties accordingly : + + keyStore=/opt/yacy_search_server/DATA/SETTINGS/yacykeystore + keyStorePassword=yourpassword + +#### Import an existing certificate: + +Importing a certificated validated by a certification authority (CA) will ensure you full HTTPS support with no security errors when accessing your YaCy peer. You can import an existing certificate in pkcs12 format. 
+ +First copy it to the YaCy Docker container volume : + + cp [yourStore].pkcs12 /var/lib/docker/volumes/[your_yacy_volume]/_data/SETTINGS/[yourStore].pkcs12 + +Then edit YaCy config file. For example with the nano text editor : + + nano /var/lib/docker/volumes/[your_yacy_volume]/_data/SETTINGS/yacy.conf + +And configure the pkcs12XXX properties accordingly : + + pkcs12ImportFile=/opt/yacy_search_server/DATA/SETTINGS/[yourStore].pkcs12 + pkcs12ImportPwd=yourpassword ### Next starts @@ -111,7 +154,7 @@ OR Create new container based on pulled image, using volume data from old container : - docker create --name [tmp-container_name] -p 8090:8090 --volumes-from=[container_name] luccioman/yacy:latest + docker create --name [tmp-container_name] -p 8090:8090 -p 8443:8443 --volumes-from=[container_name] luccioman/yacy:latest Stop old container : From 75254ac9b60c5f9425318eab3b9c212f4541ede7 Mon Sep 17 00:00:00 2001 From: luccioman Date: Fri, 5 Aug 2016 12:16:11 +0200 Subject: [PATCH 03/28] Fixed syntax errors. --- docker/Readme.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docker/Readme.md b/docker/Readme.md index 53228ae44..9da94ccf8 100755 --- a/docker/Readme.md +++ b/docker/Readme.md @@ -83,17 +83,17 @@ Note that you can list all docker volumes with : docker volume ls -#### As background process +#### Start as background process docker run -d luccioman/yacy ### HTTPS support -This images are default configured with HTTPS enabled, and use a default certificate stored in defaults/freeworldKeystore. You should use your own certificated. In order to do it, you can proceed as follow. +This images are default configured with HTTPS enabled, and use a default certificate stored in defaults/freeworldKeystore. You should use your own certificate. In order to do it, you can proceed as follow. 
#### Self-signed certificate -A self-signed certificate will provide encrypted communications with your YaCy server, but browsers will still complain about an invalid security certificate with the error "SEC_ERROR_UNKNOWN_ISSUER". If it is sufficient for you, you can add permanently add exception to your browser. +A self-signed certificate will provide encrypted communications with your YaCy server, but browsers will still complain about an invalid security certificate with the error "SEC_ERROR_UNKNOWN_ISSUER". If it is sufficient for you, you can permanently add and exception to your browser. This kind of certificate can be generated and added to your YaCy Docker container with the following : @@ -110,7 +110,7 @@ And configure the keyStoreXXXX properties accordingly : #### Import an existing certificate: -Importing a certificated validated by a certification authority (CA) will ensure you full HTTPS support with no security errors when accessing your YaCy peer. You can import an existing certificate in pkcs12 format. +Importing a certificate validated by a certification authority (CA) will ensure you have full HTTPS support with no security errors when accessing your YaCy peer. You can import an existing certificate in pkcs12 format. First copy it to the YaCy Docker container volume : From 7c0f1106a6d2d9de0aded56f783d0eb065c575c9 Mon Sep 17 00:00:00 2001 From: reger Date: Thu, 1 Sep 2016 20:33:28 +0200 Subject: [PATCH 04/28] upd master.lng for RankingSolr_p.html (add Filter Query txt) --- locales/master.lng.xlf | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/locales/master.lng.xlf b/locales/master.lng.xlf index 683fe7228..47d038e9a 100644 --- a/locales/master.lng.xlf +++ b/locales/master.lng.xlf @@ -6821,6 +6821,30 @@ "Re-Set to default" + + >Filter Query< + + + The Filter Query is attached to every query. + + + Use this to statically add a selection criteria to reduce the set of results. 
+ + + Example: "http_unique_b:true AND www_unique_b:true" will filter out all results where urls appear also with/without http(s) and/or with/without 'www.' prefix. + + + To find appropriate fields for this query, see the + + + YaCy Solr Schema + + + Warning: bad expressions here will cause that you don't have any search result! + + + "Set Filter Query" + >Boost Query< From f3f478448bab001849d3d621c159a2a0de8758cc Mon Sep 17 00:00:00 2001 From: luccioman Date: Fri, 2 Sep 2016 11:22:39 +0200 Subject: [PATCH 05/28] Explicitely set YaCy data folder when starting in MacOS bundle --- addon/YaCy.app/Contents/Info.plist | 2 +- addon/YaCy.app/Contents/MacOS/startYACYMacOS.sh | 8 ++++++++ build.xml | 3 ++- startYACY.sh | 17 +++++++++++++++-- 4 files changed, 26 insertions(+), 4 deletions(-) create mode 100755 addon/YaCy.app/Contents/MacOS/startYACYMacOS.sh diff --git a/addon/YaCy.app/Contents/Info.plist b/addon/YaCy.app/Contents/Info.plist index 69804aa9f..5f772f193 100644 --- a/addon/YaCy.app/Contents/Info.plist +++ b/addon/YaCy.app/Contents/Info.plist @@ -19,7 +19,7 @@ CFBundleAllowMixedLocalizations true CFBundleExecutable -startYACY.sh +startYACYMacOS.sh CFBundleDevelopmentRegion English CFBundlePackageType diff --git a/addon/YaCy.app/Contents/MacOS/startYACYMacOS.sh b/addon/YaCy.app/Contents/MacOS/startYACYMacOS.sh new file mode 100755 index 000000000..456ddea4e --- /dev/null +++ b/addon/YaCy.app/Contents/MacOS/startYACYMacOS.sh @@ -0,0 +1,8 @@ +#!/usr/bin/env sh + +# Launcher for YaCy in a MacOS bundle : +# rely on the generic startYACY.sh, but specifies the user home relative path for YaCy data +# This data directory is set in conforming to OS X File System Programming Guide +# see : https://developer.apple.com/library/ios/documentation/FileManagement/Conceptual/FileSystemProgrammingGuide/MacOSXDirectories/MacOSXDirectories.html + +"`dirname $0`"/startYACY.sh -startup "'Library/Application Support/net.yacy.YaCy'" diff --git a/build.xml b/build.xml index 
879296ef4..14acd7b66 100644 --- a/build.xml +++ b/build.xml @@ -764,7 +764,8 @@ - + + diff --git a/startYACY.sh b/startYACY.sh index 8763b9086..567cc5e89 100755 --- a/startYACY.sh +++ b/startYACY.sh @@ -40,6 +40,7 @@ Options -l, --logging save the output of YaCy to yacy.log -d, --debug show the output of YaCy on the console -p, --print-out only print the command, which would be executed to start YaCy + -start, -startup [data-path] start YaCy using the specified data folder path, relative to the current user home -g, --gui start a gui for YaCy USAGE } @@ -101,6 +102,10 @@ for option in $options;do -t|--tail-log) TAILLOG=1 ;; + -start|-startup) + STARTUP=1 + isparameter=1 + ;; -g|--gui) GUI=1 isparameter=1 @@ -111,7 +116,11 @@ for option in $options;do isparameter=1; continue else - parameter="$parameter $option" + if [ $parameter ];then + parameter="$parameter $option" + else + parameter="$option" + fi fi fi #parameter or option? done @@ -189,7 +198,11 @@ for N in lib/*.jar; do CLASSPATH="$CLASSPATH$N:"; done CLASSPATH=".:$CLASSPATH" cmdline="$JAVA $JAVA_ARGS -classpath $CLASSPATH net.yacy.yacy"; -if [ $GUI -eq 1 ] #gui + +if [ $STARTUP -eq 1 ] #startup +then + cmdline="$cmdline -startup $parameter" +elif [ $GUI -eq 1 ] #gui then cmdline="$cmdline -gui $parameter" fi From 1dc4306058a3354f61435d403561829590fc4e9d Mon Sep 17 00:00:00 2001 From: luccioman Date: Fri, 2 Sep 2016 11:23:02 +0200 Subject: [PATCH 06/28] Fixed indentation for better readability. 
--- source/net/yacy/yacy.java | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/source/net/yacy/yacy.java b/source/net/yacy/yacy.java index db7bbce51..44803ec6e 100644 --- a/source/net/yacy/yacy.java +++ b/source/net/yacy/yacy.java @@ -713,13 +713,17 @@ public final class yacy { //System.out.print("args=["); for (int i = 0; i < args.length; i++) System.out.print(args[i] + ", "); System.out.println("]"); if ((args.length >= 1) && (args[0].toLowerCase().equals("-startup") || args[0].equals("-start"))) { // normal start-up of yacy - if (args.length > 1) dataRoot = new File(System.getProperty("user.home").replace('\\', '/'), args[1]); - preReadSavedConfigandInit(dataRoot); + if (args.length > 1) { + dataRoot = new File(System.getProperty("user.home").replace('\\', '/'), args[1]); + } + preReadSavedConfigandInit(dataRoot); startup(dataRoot, applicationRoot, startupMemFree, startupMemTotal, false); } else if (args.length >= 1 && args[0].toLowerCase().equals("-gui")) { // start-up of yacy with gui - if (args.length > 1) dataRoot = new File(System.getProperty("user.home").replace('\\', '/'), args[1]); - preReadSavedConfigandInit(dataRoot); + if (args.length > 1) { + dataRoot = new File(System.getProperty("user.home").replace('\\', '/'), args[1]); + } + preReadSavedConfigandInit(dataRoot); startup(dataRoot, applicationRoot, startupMemFree, startupMemTotal, true); } else if ((args.length >= 1) && ((args[0].toLowerCase().equals("-shutdown")) || (args[0].equals("-stop")))) { // normal shutdown of yacy @@ -732,7 +736,7 @@ public final class yacy { } else if ((args.length >= 1) && (args[0].toLowerCase().equals("-version"))) { // show yacy version System.out.println(copyright); - } else if ((args.length > 1) && (args[0].toLowerCase().equals("-config"))) { + } else if ((args.length > 1) && (args[0].toLowerCase().equals("-config"))) { // set config parameter. 
Special handling of adminAccount=user:pwd (generates md5 encoded password) // on Windows parameter should be enclosed in doublequotes to accept = sign (e.g. -config "port=8090" "port.ssl=8043") File f = new File (dataRoot,"DATA/SETTINGS/"); @@ -778,9 +782,11 @@ public final class yacy { } System.out.println(); } - } else { - if (args.length == 1) applicationRoot= new File(args[0]); - preReadSavedConfigandInit(dataRoot); + } else { + if (args.length == 1) { + applicationRoot= new File(args[0]); + } + preReadSavedConfigandInit(dataRoot); startup(dataRoot, applicationRoot, startupMemFree, startupMemTotal, false); } } finally { From 24b87412921c6e247d28e7fbe19b388d4124e71e Mon Sep 17 00:00:00 2001 From: luccioman Date: Fri, 2 Sep 2016 11:55:46 +0200 Subject: [PATCH 07/28] Fix for startup option - Var initialization - Declaration in getopt --- startYACY.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/startYACY.sh b/startYACY.sh index 567cc5e89..5e52a9292 100755 --- a/startYACY.sh +++ b/startYACY.sh @@ -40,7 +40,7 @@ Options -l, --logging save the output of YaCy to yacy.log -d, --debug show the output of YaCy on the console -p, --print-out only print the command, which would be executed to start YaCy - -start, -startup [data-path] start YaCy using the specified data folder path, relative to the current user home + --start, --startup [data-path] start YaCy using the specified data folder path, relative to the current user home -g, --gui start a gui for YaCy USAGE } @@ -57,7 +57,7 @@ then options="`getopt hdlptg: $*`" else - options="`getopt -n YaCy -o h,d,l,p,t,g -l help,debug,logging,print-out,tail-log,gui -- $@`" + options="`getopt -n YaCy -o h,d,l,p,t,g -l help,debug,logging,print-out,tail-log,gui,start,startup -- $@`" fi if [ $? 
-ne 0 ];then @@ -72,6 +72,7 @@ LOGGING=0 DEBUG=0 PRINTONLY=0 TAILLOG=0 +STARTUP=0 GUI=0 for option in $options;do if [ $isparameter -ne 1 ];then #option @@ -202,8 +203,7 @@ cmdline="$JAVA $JAVA_ARGS -classpath $CLASSPATH net.yacy.yacy"; if [ $STARTUP -eq 1 ] #startup then cmdline="$cmdline -startup $parameter" -elif [ $GUI -eq 1 ] #gui -then +elif [ $GUI -eq 1 ];then #gui cmdline="$cmdline -gui $parameter" fi if [ $DEBUG -eq 1 ] #debug From 6801673a07cb2740318c5ec1ac7b9bbe930d521c Mon Sep 17 00:00:00 2001 From: reger Date: Sat, 3 Sep 2016 03:37:40 +0200 Subject: [PATCH 08/28] apply postranking media search boost only on media queries --- source/net/yacy/search/query/SearchEvent.java | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 6585a5060..11a322a4f 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -1439,10 +1439,19 @@ public final class SearchEvent { long r = 0; // for media search: prefer pages with many links - r += rentry.limage() << this.query.ranking.coeff_cathasimage; - r += rentry.laudio() << this.query.ranking.coeff_cathasaudio; - r += rentry.lvideo() << this.query.ranking.coeff_cathasvideo; - r += rentry.lapp() << this.query.ranking.coeff_cathasapp; + switch (this.query.contentdom) { + case IMAGE: + r += rentry.limage() << this.query.ranking.coeff_cathasimage; + break; + case AUDIO: + r += rentry.laudio() << this.query.ranking.coeff_cathasaudio; + break; + case VIDEO: + r += rentry.lvideo() << this.query.ranking.coeff_cathasvideo; + break; + case APP: + r += rentry.lapp() << this.query.ranking.coeff_cathasapp; + } // apply citation count //System.out.println("POSTRANKING CITATION: references = " + rentry.referencesCount() + ", inbound = " + rentry.llocal() + ", outbound = " + rentry.lother()); From 421a6e3a95ee7db257a79c35c5bb3bf6067d676b Mon Sep 17 
00:00:00 2001 From: luccioman Date: Sat, 3 Sep 2016 14:46:58 +0200 Subject: [PATCH 09/28] Fixed options processing for Mac OS - getopt is BSD style and does not support long options - fixed typing error inparameter value extracting for all platforms --- addon/YaCy.app/Contents/MacOS/startYACYMacOS.sh | 2 +- startYACY.sh | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/addon/YaCy.app/Contents/MacOS/startYACYMacOS.sh b/addon/YaCy.app/Contents/MacOS/startYACYMacOS.sh index 456ddea4e..692ec913f 100755 --- a/addon/YaCy.app/Contents/MacOS/startYACYMacOS.sh +++ b/addon/YaCy.app/Contents/MacOS/startYACYMacOS.sh @@ -5,4 +5,4 @@ # This data directory is set in conforming to OS X File System Programming Guide # see : https://developer.apple.com/library/ios/documentation/FileManagement/Conceptual/FileSystemProgrammingGuide/MacOSXDirectories/MacOSXDirectories.html -"`dirname $0`"/startYACY.sh -startup "'Library/Application Support/net.yacy.YaCy'" +"`dirname $0`"/startYACY.sh -s "'Library/Application Support/net.yacy.YaCy'" diff --git a/startYACY.sh b/startYACY.sh index 5e52a9292..f935ae2e3 100755 --- a/startYACY.sh +++ b/startYACY.sh @@ -40,7 +40,7 @@ Options -l, --logging save the output of YaCy to yacy.log -d, --debug show the output of YaCy on the console -p, --print-out only print the command, which would be executed to start YaCy - --start, --startup [data-path] start YaCy using the specified data folder path, relative to the current user home + -s, --startup [data-path] start YaCy using the specified data folder path, relative to the current user home -g, --gui start a gui for YaCy USAGE } @@ -48,16 +48,16 @@ USAGE #startup YaCy cd "`dirname $0`" -if [ $OS = "OpenBSD" ] +if [ $OS = "OpenBSD" ] || [ $OS = "Darwin" ] then if [ $(echo $@ | grep -o "\-\-" | wc -l) -ne 0 ] then echo "WARNING: Unfortunately this script does not support long options in $OS." 
fi - options="`getopt hdlptg: $*`" + options="`getopt hdlptsg: $*`" else - options="`getopt -n YaCy -o h,d,l,p,t,g -l help,debug,logging,print-out,tail-log,gui,start,startup -- $@`" + options="`getopt -n YaCy -o h,d,l,p,t,s,g -l help,debug,logging,print-out,tail-log,startup,gui -- $@`" fi if [ $? -ne 0 ];then @@ -103,7 +103,7 @@ for option in $options;do -t|--tail-log) TAILLOG=1 ;; - -start|-startup) + -s|-startup) STARTUP=1 isparameter=1 ;; @@ -113,7 +113,7 @@ for option in $options;do ;; esac #case option else #parameter - if [ x$option = "--" ];then #option / parameter separator + if [ $option = "--" ];then #option / parameter separator isparameter=1; continue else From 8255e91c994a6bc85817dd4943f7e78df34b640d Mon Sep 17 00:00:00 2001 From: luccioman Date: Sat, 3 Sep 2016 15:21:02 +0200 Subject: [PATCH 10/28] Fixed serverClassLoader.findClass method htroot is supposed to be a subfolder of appPath and not of dataPath, as assumed in other places where htroot is loaded. This issue was not visible when dataPath and appPath are equal. 
--- source/net/yacy/server/serverClassLoader.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/source/net/yacy/server/serverClassLoader.java b/source/net/yacy/server/serverClassLoader.java index 9412dc4be..cd627a22c 100644 --- a/source/net/yacy/server/serverClassLoader.java +++ b/source/net/yacy/server/serverClassLoader.java @@ -64,7 +64,7 @@ public final class serverClassLoader extends ClassLoader { @Override protected Class findClass(String classname) throws ClassNotFoundException { // construct path to htroot for a servletname - File cpath = new File (Switchboard.getSwitchboard().getDataPath(SwitchboardConstants.HTROOT_PATH, SwitchboardConstants.HTROOT_PATH_DEFAULT),classname+".class"); + File cpath = new File (Switchboard.getSwitchboard().getAppPath(SwitchboardConstants.HTROOT_PATH, SwitchboardConstants.HTROOT_PATH_DEFAULT),classname+".class"); return loadClass(cpath); } From cc2d9dd3f17eba482c9baf002c87abaa93371423 Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 4 Sep 2016 00:09:45 +0200 Subject: [PATCH 11/28] reactivate the use of included-in-topwords boost in postRanking + changed the postRanking to add the score only once when a word appears more than one time. + getTopics() unused code block rem'd (save performance) -> routine needs rework! 
--- source/net/yacy/search/query/SearchEvent.java | 40 ++++++++++++++----- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 11a322a4f..579267968 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -1428,7 +1428,11 @@ public final class SearchEvent { */ public void addResult(URIMetadataNode resultEntry, final float score) { if (resultEntry == null) return; - final long ranking = ((long) (score * 128.f)) + postRanking(resultEntry, new ConcurrentScoreMap() /*this.snippetProcess.rankingProcess.getTopicNavigator(10)*/); + final long ranking = ((long) (score * 128.f)) + postRanking(resultEntry, this.ref /*this.getTopicNavigator(MAX_TOPWORDS)*/); + // TODO: above was originally using (see below), but getTopicNavigator returns this.ref and possibliy alters this.ref on first call (this.ref.size < 2 -> this.ref.clear) + // TODO: verify and straighten the use of addTopic, getTopic and getTopicNavigator and related score calculation + // final long ranking = ((long) (score * 128.f)) + postRanking(resultEntry, this.getTopicNavigator(MAX_TOPWORDS)); + resultEntry.setScore(ranking); // update the score of resultEntry for access by search interface / api this.resultList.put(new ReverseElement(resultEntry, ranking)); // remove smallest in case of overflow if (pollImmediately) this.resultList.poll(); // prevent re-ranking in case there is only a single index source which has already ranked entries. 
@@ -1467,24 +1471,27 @@ public final class SearchEvent { final String urlstring = rentry.url().toNormalform(true); final String[] urlcomps = MultiProtocolURL.urlComps(urlstring); final String[] descrcomps = MultiProtocolURL.splitpattern.split(rentry.title().toLowerCase()); - for (final String urlcomp : urlcomps) { + + // apply query-in-result matching + final QueryGoal.NormalizedWords urlcompmap = new QueryGoal.NormalizedWords(urlcomps); + final QueryGoal.NormalizedWords descrcompmap = new QueryGoal.NormalizedWords(descrcomps); + // the token map is used (instead of urlcomps/descrcomps) to determine appearance in url/title and eliminate double occurances + // (example Title="News News News News News News - today is party -- News News News News News News" to add one score instead of 12 * score !) + for (final String urlcomp : urlcompmap) { int tc = topwords.get(urlcomp); if (tc > 0) r += Math.max(1, tc) << this.query.ranking.coeff_urlcompintoplist; } - for (final String descrcomp : descrcomps) { + for (final String descrcomp : descrcompmap) { int tc = topwords.get(descrcomp); if (tc > 0) r += Math.max(1, tc) << this.query.ranking.coeff_descrcompintoplist; } - // apply query-in-result matching - final QueryGoal.NormalizedWords urlcomph = new QueryGoal.NormalizedWords(urlcomps); - final QueryGoal.NormalizedWords descrcomph = new QueryGoal.NormalizedWords(descrcomps); final Iterator shi = this.query.getQueryGoal().getIncludeWords(); String queryword; while (shi.hasNext()) { queryword = shi.next(); - if (urlcomph.contains(queryword)) r += 256 << this.query.ranking.coeff_appurl; - if (descrcomph.contains(queryword)) r += 256 << this.query.ranking.coeff_app_dc_title; + if (urlcompmap.contains(queryword)) r += 256 << this.query.ranking.coeff_appurl; + if (descrcompmap.contains(queryword)) r += 256 << this.query.ranking.coeff_app_dc_title; } return r; } @@ -1827,14 +1834,24 @@ public final class SearchEvent { // this is only available if execQuery() was called before return 
this.localSearchInclusion; } - + + /** + * create a list of words that had been computed by statistics over all + * words that appeared in the url or the description of all urls + * + * @param maxcount max number of topwords to return + * @param maxtime max time allowed to use + * @return + */ public ScoreMap getTopics(final int maxcount, final long maxtime) { - // create a list of words that had been computed by statistics over all - // words that appeared in the url or the description of all urls final ScoreMap result = new ConcurrentScoreMap(); if ( this.ref.sizeSmaller(2) ) { this.ref.clear(); // navigators with one entry are not useful } + /* ---------------------------------- start of rem (2016-09-03) + // TODO: result map is not used currently, verify if it should and use or delete this code block + // TODO: as it is not used now - in favour of performance this code block is rem'ed (2016-09-03) + final Map counts = new HashMap(); final Iterator i = this.ref.keys(false); String word; @@ -1860,6 +1877,7 @@ public final class SearchEvent { result.set(ce.getKey(), (int) (((double) maxcount) * (ce.getValue() - min) / (max - min))); } } + /* ------------------------------------ end of rem (2016-09-03) */ return this.ref; } From 47391678e7bc130cfc6d5fb76337b4ef81a64b8d Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 4 Sep 2016 01:00:28 +0200 Subject: [PATCH 12/28] TranslationNews: take out limitation to send only one text per translated file (to avoid need of repeated publish button hits) --- htroot/TransNews_p.java | 26 +++----------------------- 1 file changed, 3 insertions(+), 23 deletions(-) diff --git a/htroot/TransNews_p.java b/htroot/TransNews_p.java index 5a2ab53aa..468d92adc 100644 --- a/htroot/TransNews_p.java +++ b/htroot/TransNews_p.java @@ -42,7 +42,6 @@ import net.yacy.server.serverObjects; import net.yacy.server.serverSwitch; import net.yacy.utils.crypt; import net.yacy.utils.translation.TranslationManager; -import 
net.yacy.utils.translation.TranslatorXliff; public class TransNews_p { @@ -90,32 +89,13 @@ public class TransNews_p { continue; } if (NewsPool.CATEGORY_TRANSLATION_ADD.equals(rtmp.category())) { - //String tmplng = rtmp.attribute("language", null); + String tmplng = rtmp.attribute("language", null); String tmpfile = rtmp.attribute("file", null); String tmpsource = rtmp.attribute("source", null); - String tmptarget = rtmp.attribute("target", null); + //String tmptarget = rtmp.attribute("target", null); - if (sb.peers.mySeed().hash.equals(rtmp.originator())) { - /* - if (tmplng != null && tmplng.equals(currentlang)) { - sendit = false; - break; - }*/ - if (tmpfile != null && tmpfile.equals(file)) { - sendit = false; - break; - } - if (tmpsource != null && tmpsource.equals(sourcetxt)) { - sendit = false; - break; - } - if (tmptarget != null && tmptarget.equals(targettxt)) { - sendit = false; - break; - } - } // if news with file and source exist (maybe from other peer) - skip sending another msg (to avoid confusion) - if ((tmpfile != null && tmpfile.equals(file)) + if ((tmplng != null && tmplng.equals(currentlang)) && (tmpfile != null && tmpfile.equals(file)) && (tmpsource != null && tmpsource.equals(sourcetxt))) { sendit = false; break; From a2777903d64f662cc6eb00d884e859cfc9a93c5e Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 4 Sep 2016 02:29:04 +0200 Subject: [PATCH 13/28] include translation news service in status submenue + display translation proposal news only for current language (in TransNews servlet) --- htroot/News.html | 4 ++++ htroot/TransNews_p.html | 2 +- htroot/TransNews_p.java | 1 + .../env/templates/submenuComputation.template | 1 + locales/master.lng.xlf | 18 ++++++++++++++++++ 5 files changed, 25 insertions(+), 1 deletion(-) diff --git a/htroot/News.html b/htroot/News.html index 1c2b83507..320717d3d 100644 --- a/htroot/News.html +++ b/htroot/News.html @@ -43,6 +43,10 @@ A change in the personal profile will create a news entry. 
You can see recently made changes of profile entries on the Network page, where that profile change is visualized with a '*' beside the 'P' (profile) - selector. +
  • + Publishing of added or modified translation for the user interface. Other peers may include it in their local translation list. + To publish a translation, use the integrated translation editor to add a translation and publish it afterwards. +
  • More news services will follow. diff --git a/htroot/TransNews_p.html b/htroot/TransNews_p.html index d5b755c02..7e69deedb 100644 --- a/htroot/TransNews_p.html +++ b/htroot/TransNews_p.html @@ -6,7 +6,7 @@ #%env/templates/header.template%# - + #%env/templates/submenuComputation.template%#

    Translation News for Language #[currentlang]#

    diff --git a/htroot/TransNews_p.java b/htroot/TransNews_p.java index 468d92adc..d570cf363 100644 --- a/htroot/TransNews_p.java +++ b/htroot/TransNews_p.java @@ -200,6 +200,7 @@ public class TransNews_p { continue; } + if (!lang.equals(currentlang)) continue; String existingtarget = null; //transMgr.getTranslation(filename, source); Map tmpMap = localTrans.get(filename); diff --git a/htroot/env/templates/submenuComputation.template b/htroot/env/templates/submenuComputation.template index b6062bcf8..5a112d2ce 100644 --- a/htroot/env/templates/submenuComputation.template +++ b/htroot/env/templates/submenuComputation.template @@ -33,6 +33,7 @@ diff --git a/locales/master.lng.xlf b/locales/master.lng.xlf index 47d038e9a..fec0c7550 100644 --- a/locales/master.lng.xlf +++ b/locales/master.lng.xlf @@ -5966,6 +5966,21 @@ profile entries on the Network page, where that profile change is visualized with a '*' beside the 'P' (profile) - selector. + + Publishing of added or modified translation for the user interface. + + + Other peers may include it in their local translation list. + + + To publish a translation, use the integrated + + + translation editor + + + to add a translation and publish it afterwards. 
+ Above you can see four menues: @@ -10124,6 +10139,9 @@ >Local Peer Wiki< + + UI Translations + From f34b493ab639e2a28320c8435f4290cf53b56ca3 Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 4 Sep 2016 03:05:25 +0200 Subject: [PATCH 14/28] fix fr.lng (missing quotes) broke sentence appart to reduce inclusion of coding tags in translation --- locales/fr.lng | 10 ++++++++-- locales/master.lng.xlf | 18 ++++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/locales/fr.lng b/locales/fr.lng index 912f628f0..9d81c49c6 100644 --- a/locales/fr.lng +++ b/locales/fr.lng @@ -1212,8 +1212,14 @@ Category==Categorie Received==Reçu Distributed==Distribué Attributes==Attributs -"#(page)#::Process Selected News::Delete Selected News::Abort Publication of Selected News::Delete Selected News#(/page)#"==#(page)#::Traiter les nouvelles sélectionnées::Supprimer les nouvelles sélectionnées::Annuler la publication des nouvelles sélectionnées::Supprimer les nouvelles sélectionées#(/page)# -"#(page)#::Process All News::Delete All News::Abort Publication of All News::Delete All News#(/page)#"==#(page)#::Traiter toutes les nouvelles::Supprimer toutes les nouvelles::Annuler la publication de toutes les nouvelles::Supprimer toutes les nouvelles#(/page)# +Process Selected News==Traiter les nouvelles sélectionnées +Delete Selected News==Supprimer les nouvelles sélectionnées +Abort Publication of Selected News==Annuler la publication des nouvelles sélectionnées +Delete Selected News==Supprimer les nouvelles sélectionées +Process All News==Traiter toutes les nouvelles +Delete All News==Supprimer toutes les nouvelles +Abort Publication of All News==Annuler la publication de toutes les nouvelles +Delete All News==Supprimer toutes les nouvelles #----------------------------- #File: Performance_p.html diff --git a/locales/master.lng.xlf b/locales/master.lng.xlf index fec0c7550..e31b52804 100644 --- a/locales/master.lng.xlf +++ b/locales/master.lng.xlf @@ -6023,6 +6023,24 @@ 
Attributes + + Process Selected News + + + Delete Selected News + + + Abort Publication of Selected News + + + Process All News + + + Delete All News + + + Abort Publication of All News + "#(page)#::Process Selected News::Delete Selected News::Abort Publication of Selected News::Delete Selected News#(/page)#" From ebf818ad9597009a68b6f7a925b42a2a6f821bd4 Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 4 Sep 2016 06:42:48 +0200 Subject: [PATCH 15/28] log a error on aborted news publish (due to duplicate news.id) + change printed err msg to log entry in PeerAction.processPeerArrival --- htroot/TransNews_p.java | 6 +++--- source/net/yacy/peers/NewsPool.java | 2 ++ source/net/yacy/peers/PeerActions.java | 3 +-- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/htroot/TransNews_p.java b/htroot/TransNews_p.java index d570cf363..98c432afd 100644 --- a/htroot/TransNews_p.java +++ b/htroot/TransNews_p.java @@ -169,8 +169,8 @@ public class TransNews_p { final HashMap positiveHashes = new HashMap(); // a mapping from an url hash to Integer (count of votes) accumulateVotes(sb, negativeHashes, positiveHashes, NewsPool.INCOMING_DB); final ScoreMap ranking = new ConcurrentScoreMap(); // score cluster for url hashes - final HashMap Translation = new HashMap(); // a mapping from an url hash to a kelondroRow.Entry with display properties - accumulateTranslations(sb, Translation, ranking, negativeHashes, positiveHashes, NewsPool.INCOMING_DB); + final HashMap translation = new HashMap(); // a mapping from an url hash to a kelondroRow.Entry with display properties + accumulateTranslations(sb, translation, ranking, negativeHashes, positiveHashes, NewsPool.INCOMING_DB); // read out translation-news array and create property entries final Iterator k = ranking.keys(false); @@ -187,7 +187,7 @@ public class TransNews_p { continue; } - row = Translation.get(refid); + row = translation.get(refid); if (row == null) { continue; } diff --git a/source/net/yacy/peers/NewsPool.java 
b/source/net/yacy/peers/NewsPool.java index 1f13232f8..a16a8bcce 100644 --- a/source/net/yacy/peers/NewsPool.java +++ b/source/net/yacy/peers/NewsPool.java @@ -325,6 +325,8 @@ public class NewsPool { if (this.newsDB.get(record.id()) == null) { this.incomingNews.push(record); // we want to see our own news.. this.outgoingNews.push(record); // .. and put it on the publishing list + } else { + ConcurrentLog.info("NewsPool", "publishing of news aborted, news with same id (time + originator) exists id=" + record.id()); } } catch (final Exception e) { ConcurrentLog.logException(e); diff --git a/source/net/yacy/peers/PeerActions.java b/source/net/yacy/peers/PeerActions.java index b249d5c95..46bb263aa 100644 --- a/source/net/yacy/peers/PeerActions.java +++ b/source/net/yacy/peers/PeerActions.java @@ -30,7 +30,6 @@ import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.feed.RSSMessage; import net.yacy.cora.storage.ConcurrentARC; import net.yacy.kelondro.util.MapTools; -import net.yacy.peers.operation.yacyVersion; public class PeerActions { @@ -261,7 +260,7 @@ public class PeerActions { final String cre1 = MapTools.string2map(decodedString, ",").get("cre"); final String cre2 = MapTools.string2map(record.toString(), ",").get("cre"); if ((cre1 == null) || (cre2 == null) || (!(cre1.equals(cre2)))) { - System.out.println("### ERROR - cre are not equal: cre1=" + cre1 + ", cre2=" + cre2); + Network.log.warn("processPeerArrival: ### ERROR - message creation date verification not equal: cre1=" + cre1 + ", cre2=" + cre2); return; } try { From 39dd24469387a84933277e7580b77aea46cf0f57 Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 4 Sep 2016 22:18:07 +0200 Subject: [PATCH 16/28] fix ConcurrentScoreMap.set() calculation of totalCount() + test case --- .../yacy/cora/sorting/ConcurrentScoreMap.java | 5 ++- .../cora/sorting/ConcurrentScoreMapTest.java | 34 +++++++++++++++++++ 2 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 
test/java/net/yacy/cora/sorting/ConcurrentScoreMapTest.java diff --git a/source/net/yacy/cora/sorting/ConcurrentScoreMap.java b/source/net/yacy/cora/sorting/ConcurrentScoreMap.java index 95b2ce3b8..77304e576 100644 --- a/source/net/yacy/cora/sorting/ConcurrentScoreMap.java +++ b/source/net/yacy/cora/sorting/ConcurrentScoreMap.java @@ -133,7 +133,10 @@ public class ConcurrentScoreMap extends AbstractScoreMap implements ScoreM if (obj == null) return; // use atomic operations - this.map.putIfAbsent(obj, new AtomicLong(0)); + final AtomicLong old = this.map.putIfAbsent(obj, new AtomicLong(0)); + // adjust overall counter if value replaced + if (old != null) this.gcount -= old.longValue(); // must use old before setting a new value (it's an object reference) + this.map.get(obj).set(newScore); // increase overall counter diff --git a/test/java/net/yacy/cora/sorting/ConcurrentScoreMapTest.java b/test/java/net/yacy/cora/sorting/ConcurrentScoreMapTest.java new file mode 100644 index 000000000..443a5436a --- /dev/null +++ b/test/java/net/yacy/cora/sorting/ConcurrentScoreMapTest.java @@ -0,0 +1,34 @@ + +package net.yacy.cora.sorting; + +import java.util.Iterator; +import static org.junit.Assert.assertEquals; +import org.junit.Test; + + +public class ConcurrentScoreMapTest { + + /** + * Test of totalCount method, of class ConcurrentScoreMap.
+ */ + @Test + public void testTotalCount() { + final ConcurrentScoreMap csm = new ConcurrentScoreMap(); + csm.set("first", 10); + csm.set("second", 5); + csm.set("third", 13); + + csm.set("first", 100); + + final Iterator it = csm.keys(true); + long sum = 0; + while (it.hasNext()) { + String x = it.next(); + long val = csm.get(x); + sum += val; + } + + assertEquals(sum, csm.totalCount()); + } + +} From 51c077f49351f144c71cdefc5ea237bad16622f6 Mon Sep 17 00:00:00 2001 From: reger Date: Mon, 5 Sep 2016 00:07:01 +0200 Subject: [PATCH 17/28] adjust the getTopics() and getTopicNavigator() to current usage - move the maxcount limit restriction completely to getTopicNavigator (as it is not used in getTopics) - let search servlet use getTopics by default (w/o RWI connected check, as of now, Topics are available w/o any additional index interaction) --- htroot/yacy/search.java | 4 +- source/net/yacy/search/query/SearchEvent.java | 52 +++++++++++++++---- 2 files changed, 44 insertions(+), 12 deletions(-) diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 0c831c5dd..c208e4282 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -380,7 +380,9 @@ public final class search { // prepare reference hints final long timer = System.currentTimeMillis(); - final ScoreMap topicNavigator = sb.index.connectedRWI() ? theSearch.getTopics(5, 100) : new ConcurrentScoreMap(); + //final ScoreMap topicNavigator = sb.index.connectedRWI() ?
theSearch.getTopics(5, 100) : new ConcurrentScoreMap(); + final ScoreMap topicNavigator = theSearch.getTopics(); // as there is currently no index interaction in getTopics(), we can use it by default + final StringBuilder refstr = new StringBuilder(6000); final Iterator navigatorIterator = topicNavigator.keys(false); int i = 0; diff --git a/source/net/yacy/search/query/SearchEvent.java b/source/net/yacy/search/query/SearchEvent.java index 579267968..a291451d4 100644 --- a/source/net/yacy/search/query/SearchEvent.java +++ b/source/net/yacy/search/query/SearchEvent.java @@ -1329,10 +1329,32 @@ public final class SearchEvent { public long getSnippetComputationTime() { return this.snippetComputationAllTime; } - - public ScoreMap getTopicNavigator(final int count ) { + + /** + * Get topics in a ScoreMap if config allows topic navigator + * (the topics are filtered by badwords, stopwords and words included in the query) + * + * @param count max number of topics returned + * @return ScoreMap with max number of topics or null if + */ + public ScoreMap getTopicNavigator(final int count) { if (this.topicNavigatorCount > 0 && count >= 0) { //topicNavigatorCount set during init, 0=no nav - return this.getTopics(count != 0 ? count : this.topicNavigatorCount, 500); + if (!this.ref.sizeSmaller(2)) { + ScoreMap result; + int ic = count != 0 ? 
count : this.topicNavigatorCount; + + if (this.ref.size() <= ic) { // size matches return map directly + result = this.getTopics(/*ic, 500*/); + } else { // collect top most count topics + result = new ConcurrentScoreMap(); + Iterator it = this.getTopics(/*ic, 500*/).keys(false); + while (ic-- > 0 && it.hasNext()) { + String word = it.next(); + result.set(word, this.ref.get(word)); + } + } + return result; + } } return null; } @@ -1836,21 +1858,20 @@ public final class SearchEvent { } /** - * create a list of words that had been computed by statistics over all + * Return the list of words that had been computed by statistics over all * words that appeared in the url or the description of all urls * - * @param maxcount max number of topwords to return - * @param maxtime max time allowed to use - * @return + * @return ScoreMap */ - public ScoreMap getTopics(final int maxcount, final long maxtime) { + public ScoreMap getTopics(/* final int maxcount, final long maxtime */) { + /* ---------------------------------- start of rem (2016-09-03) + // TODO: result map is not used currently, verify if it should and use or delete this code block + // TODO: as it is not used now - in favour of performance this code block is rem'ed (2016-09-03) + final ScoreMap result = new ConcurrentScoreMap(); if ( this.ref.sizeSmaller(2) ) { this.ref.clear(); // navigators with one entry are not useful } - /* ---------------------------------- start of rem (2016-09-03) - // TODO: result map is not used currently, verify if it should and use or delete this code block - // TODO: as it is not used now - in favour of performance this code block is rem'ed (2016-09-03) final Map counts = new HashMap(); final Iterator i = this.ref.keys(false); @@ -1883,6 +1904,11 @@ public final class SearchEvent { private final static Pattern lettermatch = Pattern.compile("[a-z]+"); + /** + * Collects topics in a ScoreMap for words not included in the query words. 
+ * Words are also filtered by badword blacklist and stopword list. + * @param words + */ public void addTopic(final String[] words) { String word; for ( final String w : words ) { @@ -1899,6 +1925,10 @@ public final class SearchEvent { } } + /** + * Add title words to this searchEvent's topic score map + * @param resultEntry + */ protected void addTopics(final URIMetadataNode resultEntry) { // take out relevant information for reference computation if ((resultEntry.url() == null) || (resultEntry.title() == null)) return; From e310ec5f702154c215139a0e0fc35db2155a3637 Mon Sep 17 00:00:00 2001 From: reger Date: Tue, 6 Sep 2016 00:05:59 +0200 Subject: [PATCH 18/28] fix posInText ranking calculation to score 0 on no position info + fix Word posInText calc in Tokenizer to start with 1 + test case --- source/net/yacy/document/Tokenizer.java | 2 +- .../yacy/search/ranking/ReferenceOrder.java | 4 +- .../java/net/yacy/document/TokenizerTest.java | 39 +++++++++++++++++++ 3 files changed, 42 insertions(+), 3 deletions(-) create mode 100644 test/java/net/yacy/document/TokenizerTest.java diff --git a/source/net/yacy/document/Tokenizer.java b/source/net/yacy/document/Tokenizer.java index ff2e94bff..ca5591795 100644 --- a/source/net/yacy/document/Tokenizer.java +++ b/source/net/yacy/document/Tokenizer.java @@ -170,7 +170,7 @@ public class Tokenizer { wsp.inc(); } else { // word does not yet exist, create new word entry - wordHandle = wordHandleCount++; + wordHandle = ++wordHandleCount; // let word positions start with 1 wsp = new Word(wordHandle, wordInSentenceCounter, /* sentences.size() + */ 100); wsp.flags = this.RESULT_FLAGS.clone(); this.words.put(word.toLowerCase(), wsp); diff --git a/source/net/yacy/search/ranking/ReferenceOrder.java b/source/net/yacy/search/ranking/ReferenceOrder.java index fd6cf5be8..7e7376cbc 100644 --- a/source/net/yacy/search/ranking/ReferenceOrder.java +++ b/source/net/yacy/search/ranking/ReferenceOrder.java @@ -228,13 +228,13 @@ public class ReferenceOrder {
assert this.ranking != null; final long tf = ((this.max.termFrequency() == this.min.termFrequency()) ? 0 : (((int)(((t.termFrequency()-this.min.termFrequency())*256.0)/(this.max.termFrequency() - this.min.termFrequency())))) << this.ranking.coeff_termfrequency); //System.out.println("tf(" + t.urlHash + ") = " + Math.floor(1000 * t.termFrequency()) + ", min = " + Math.floor(1000 * min.termFrequency()) + ", max = " + Math.floor(1000 * max.termFrequency()) + ", tf-normed = " + tf); - final int maxmaxpos = this.max.maxposition(); + final int maxmaxpos = this.max.maxposition(); // returns Integer.MIN_VALUE if positions empty final int minminpos = this.min.minposition(); final long r = ((256 - DigestURL.domLengthNormalized(t.urlhash())) << this.ranking.coeff_domlength) + ((this.max.urlcomps() == this.min.urlcomps() ) ? 0 : (256 - (((t.urlcomps() - this.min.urlcomps() ) << 8) / (this.max.urlcomps() - this.min.urlcomps()) )) << this.ranking.coeff_urlcomps) + ((this.max.urllength() == this.min.urllength() ) ? 0 : (256 - (((t.urllength() - this.min.urllength() ) << 8) / (this.max.urllength() - this.min.urllength()) )) << this.ranking.coeff_urllength) - + ((maxmaxpos == minminpos) ? 0 : (256 - (((t.minposition() - minminpos) << 8) / (maxmaxpos - minminpos))) << this.ranking.coeff_posintext) + + ((maxmaxpos == minminpos || maxmaxpos < 0) ? 0 : (256 - (((t.minposition() - minminpos) << 8) / (maxmaxpos - minminpos))) << this.ranking.coeff_posintext) + ((this.max.posofphrase() == this.min.posofphrase()) ? 0 : (256 - (((t.posofphrase() - this.min.posofphrase() ) << 8) / (this.max.posofphrase() - this.min.posofphrase()) )) << this.ranking.coeff_posofphrase) + ((this.max.posinphrase() == this.min.posinphrase()) ? 0 : (256 - (((t.posinphrase() - this.min.posinphrase() ) << 8) / (this.max.posinphrase() - this.min.posinphrase()) )) << this.ranking.coeff_posinphrase) + ((this.max.distance() == this.min.distance() ) ? 
0 : (256 - (((t.distance() - this.min.distance() ) << 8) / (this.max.distance() - this.min.distance()) )) << this.ranking.coeff_worddistance) diff --git a/test/java/net/yacy/document/TokenizerTest.java b/test/java/net/yacy/document/TokenizerTest.java new file mode 100644 index 000000000..e54807105 --- /dev/null +++ b/test/java/net/yacy/document/TokenizerTest.java @@ -0,0 +1,39 @@ + +package net.yacy.document; + +import java.net.MalformedURLException; +import java.util.Map; +import net.yacy.cora.document.WordCache; +import net.yacy.kelondro.data.word.Word; +import org.junit.Test; +import static org.junit.Assert.*; + + +public class TokenizerTest { + + /** + * Test of words method, of class Tokenizer. + */ + @Test + public void testWords() throws MalformedURLException { + // pos = 1 2 3 4 5 6 7 8 9 10 // 1-letter words don't count + String text = "One word is not a sentence because words are just words."; + WordCache meaningLib = new WordCache(null); + boolean doAutotagging = false; + VocabularyScraper scraper = null; + + Tokenizer t = new Tokenizer(null, text, meaningLib, doAutotagging, scraper); + + Map words = t.words; + + // test extracted word information (position) + Word w = words.get("word"); + assertEquals("position of 'word' ", 2, w.posInText); + assertEquals("occurence of 'word' ", 1, w.occurrences()); + + w = words.get("words"); + assertEquals("position of 'words' ", 7, w.posInText); + assertEquals("occurence of 'words' ", 2, w.occurrences()); + } + +} From 120bf7e6e20c8b0cab4bcd656a626510d149a105 Mon Sep 17 00:00:00 2001 From: reger Date: Tue, 6 Sep 2016 03:18:02 +0200 Subject: [PATCH 19/28] implemented RWI WordReference to return the word position value (was always left empty) This is needed and enables existing word position ranking for RWI. The upcoming concurrency issue in word position min/max calculation was eliminated by an iterator.hasNext check before next() access.
--- .../kelondro/data/word/WordReferenceRow.java | 9 ++++++++- .../kelondro/data/word/WordReferenceVars.java | 17 ++++++++++++----- .../yacy/kelondro/rwi/AbstractReference.java | 16 ++++++++++++++-- 3 files changed, 34 insertions(+), 8 deletions(-) diff --git a/source/net/yacy/kelondro/data/word/WordReferenceRow.java b/source/net/yacy/kelondro/data/word/WordReferenceRow.java index 5575c06d7..7ec32879c 100644 --- a/source/net/yacy/kelondro/data/word/WordReferenceRow.java +++ b/source/net/yacy/kelondro/data/word/WordReferenceRow.java @@ -252,9 +252,16 @@ public final class WordReferenceRow extends AbstractReference implements WordRef return (0xff & this.entry.getColByte(col_hitcount)); } + /** + * First position of word in text + * @return Collection with one element + */ @Override public Collection positions() { - return new ArrayList(0); + int pos = (int) this.entry.getColLong(col_posintext); + ArrayList arr = new ArrayList(1); + arr.add(pos); + return arr; } @Override diff --git a/source/net/yacy/kelondro/data/word/WordReferenceVars.java b/source/net/yacy/kelondro/data/word/WordReferenceVars.java index af30c4db7..1cec8ad4e 100644 --- a/source/net/yacy/kelondro/data/word/WordReferenceVars.java +++ b/source/net/yacy/kelondro/data/word/WordReferenceVars.java @@ -60,10 +60,11 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc public final byte[] urlHash; private String hostHash = null; private final char type; - private int hitcount, llocal, lother, phrasesintext, - posinphrase, posofphrase, - urlcomps, urllength, - wordsintext, wordsintitle; + private int hitcount, // how often appears this word in the text + llocal, lother, phrasesintext, + posinphrase, posofphrase, + urlcomps, urllength, + wordsintext, wordsintitle; private int virtualAge; private final Queue positions; private double termFrequency; @@ -210,6 +211,10 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc return this.type; } + /** + * How 
often appears this word in the text + * @return + */ @Override public int hitcount() { return this.hitcount; @@ -259,7 +264,9 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc this.hitcount, // how often appears this word in the text this.wordsintext, // total number of words this.phrasesintext, // total number of phrases - this.positions.isEmpty() ? 1 : this.positions.iterator().next(), // position of word in all words + + // TODO: positon 1 on empty positions may give high ranking scores for unknown pos (needs to be checked if 0 would be appropriate) + this.positions.isEmpty() ? -1 : this.positions.iterator().next(), // position of word in all words this.posinphrase, // position of word in its phrase this.posofphrase, // number of the phrase where word appears this.lastModified, // last-modified time of the document where word appears diff --git a/source/net/yacy/kelondro/rwi/AbstractReference.java b/source/net/yacy/kelondro/rwi/AbstractReference.java index 52c3193b0..e1097da85 100644 --- a/source/net/yacy/kelondro/rwi/AbstractReference.java +++ b/source/net/yacy/kelondro/rwi/AbstractReference.java @@ -63,9 +63,17 @@ public abstract class AbstractReference implements Reference { private static int max(Collection a) { if (a == null || a.isEmpty()) return Integer.MIN_VALUE; Iterator i = a.iterator(); + /* + expirienced concurrency issue with this short cut 2016-09-06 + on i.next w/o test of hasNext before + java.util.NoSuchElementException at java.util.concurrent.LinkedBlockingQueue$Itr.next(LinkedBlockingQueue.java:828) + if (a.size() == 1) return i.next(); if (a.size() == 2) return Math.max(i.next(), i.next()); int r = i.next(); + */ + int r = Integer.MIN_VALUE; + int s; while (i.hasNext()) { s = i.next(); @@ -77,9 +85,12 @@ public abstract class AbstractReference implements Reference { private static int min(Collection a) { if (a == null || a.isEmpty()) return Integer.MAX_VALUE; Iterator i = a.iterator(); + /* concurrency issue 
(see max()) if (a.size() == 1) return i.next(); if (a.size() == 2) return Math.min(i.next(), i.next()); int r = i.next(); + */ + int r = Integer.MAX_VALUE; int s; while (i.hasNext()) { s = i.next(); @@ -103,10 +114,11 @@ public abstract class AbstractReference implements Reference { if (positions().size() < 2) return 0; int d = 0; Iterator i = positions().iterator(); - int s0 = i.next(), s1; + // int s0 = i.next(), s1; // concurrency issue see max() + int s0 = -1, s1; while (i.hasNext()) { s1 = i.next(); - d += Math.abs(s0 - s1); + if (s0 > 0) d += Math.abs(s0 - s1); s0 = s1; } return d / (positions().size() - 1); From 9934f546bbc67650c25f74b8f65830e253a3ba41 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 6 Sep 2016 06:56:51 +0200 Subject: [PATCH 20/28] added default fl to solr query, removed large texts retrieval and changed snippet to description tag if no other description is available --- .../responsewriter/YJsonResponseWriter.java | 20 +++++-------------- .../yacy/http/servlets/SolrSelectServlet.java | 14 +++++++++++-- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java b/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java index 3588a336c..eda6b71fc 100644 --- a/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java +++ b/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java @@ -54,6 +54,9 @@ import org.apache.solr.search.SolrIndexSearcher; /** * write the opensearch result in YaCys special way to include as much as in opensearch is included. * This will also include YaCy facets. 
+ * + * example: + * http://localhost:8090/solr/select?hl=false&wt=yjson&facet=true&facet.mincount=1&facet.field=host_s&facet.field=url_file_ext_s&facet.field=url_protocol_s&facet.field=author_sxt&facet.field=collection_sxt&start=0&rows=10&query=www */ public class YJsonResponseWriter implements QueryResponseWriter { @@ -135,7 +138,6 @@ public class YJsonResponseWriter implements QueryResponseWriter { Document doc = searcher.doc(id, OpensearchResponseWriter.SOLR_FIELDS); List fields = doc.getFields(); int fieldc = fields.size(); - List texts = new ArrayList(); MultiProtocolURL url = null; String urlhash = null; List descriptions = new ArrayList(); @@ -166,13 +168,11 @@ public class YJsonResponseWriter implements QueryResponseWriter { } if (CollectionSchema.title.getSolrFieldName().equals(fieldName)) { title = value.stringValue(); - texts.add(title); continue; } if (CollectionSchema.description_txt.getSolrFieldName().equals(fieldName)) { String description = value.stringValue(); descriptions.add(description); - texts.add(description); continue; } if (CollectionSchema.id.getSolrFieldName().equals(fieldName)) { @@ -197,25 +197,15 @@ public class YJsonResponseWriter implements QueryResponseWriter { solitaireTag(writer, "sizename", sizemb > 0 ? (Integer.toString(sizemb) + " mbyte") : sizekb > 0 ? 
(Integer.toString(sizekb) + " kbyte") : (Integer.toString(size) + " byte")); continue; } - if (CollectionSchema.text_t.getSolrFieldName().equals(fieldName)) { - texts.add(value.stringValue()); - continue; - } - if (CollectionSchema.h1_txt.getSolrFieldName().equals(fieldName) || CollectionSchema.h2_txt.getSolrFieldName().equals(fieldName) || - CollectionSchema.h3_txt.getSolrFieldName().equals(fieldName) || CollectionSchema.h4_txt.getSolrFieldName().equals(fieldName) || - CollectionSchema.h5_txt.getSolrFieldName().equals(fieldName) || CollectionSchema.h6_txt.getSolrFieldName().equals(fieldName)) { - // because these are multi-valued fields, there can be several of each - texts.add(value.stringValue()); - continue; - } //missing: "code","faviconCode" } // compute snippet from texts solitaireTag(writer, "path", path.toString()); - solitaireTag(writer, "title", title.length() == 0 ? (texts.size() == 0 ? path.toString() : texts.get(0)) : title); + solitaireTag(writer, "title", title.length() == 0 ? path.toString() : title); LinkedHashSet snippet = urlhash == null ? null : snippets.get(urlhash); + if (snippet == null) {snippet = new LinkedHashSet<>(); snippet.addAll(descriptions);} OpensearchResponseWriter.removeSubsumedTitle(snippet, title); writer.write("\"description\":\""); writer.write(serverObjects.toJSON(snippet == null || snippet.size() == 0 ? (descriptions.size() > 0 ? 
descriptions.get(0) : "") : OpensearchResponseWriter.getLargestSnippet(snippet))); writer.write("\"\n}\n"); if (i < responseCount - 1) { diff --git a/source/net/yacy/http/servlets/SolrSelectServlet.java b/source/net/yacy/http/servlets/SolrSelectServlet.java index e93eb6891..5d440588c 100644 --- a/source/net/yacy/http/servlets/SolrSelectServlet.java +++ b/source/net/yacy/http/servlets/SolrSelectServlet.java @@ -124,7 +124,7 @@ public class SolrSelectServlet extends HttpServlet { Switchboard sb = Switchboard.getSwitchboard(); // TODO: isUserInRole needs a login to jetty container (not done automatically on admin from localhost) - boolean authenticated = hrequest.isUserInRole(UserDB.AccessRight.ADMIN_RIGHT.toString());; + boolean authenticated = hrequest.isUserInRole(UserDB.AccessRight.ADMIN_RIGHT.toString()); // count remote searches if this was part of a p2p search if (mmsp.getMap().containsKey("partitions")) { @@ -190,11 +190,21 @@ public class SolrSelectServlet extends HttpServlet { if ((responseWriter instanceof YJsonResponseWriter || responseWriter instanceof OpensearchResponseWriter) && "true".equals(mmsp.get("hl", "true"))) { // add options for snippet generation if (!mmsp.getMap().containsKey("hl.q")) mmsp.getMap().put("hl.q", new String[]{q}); - if (!mmsp.getMap().containsKey("hl.fl")) mmsp.getMap().put("hl.fl", new String[]{CollectionSchema.description_txt + "," + CollectionSchema.h4_txt.getSolrFieldName() + "," + CollectionSchema.h3_txt.getSolrFieldName() + "," + CollectionSchema.h2_txt.getSolrFieldName() + "," + CollectionSchema.h1_txt.getSolrFieldName() + "," + CollectionSchema.text_t.getSolrFieldName()}); + if (!mmsp.getMap().containsKey("hl.fl")) mmsp.getMap().put("hl.fl", new String[]{CollectionSchema.description_txt.getSolrFieldName() + "," + CollectionSchema.h4_txt.getSolrFieldName() + "," + CollectionSchema.h3_txt.getSolrFieldName() + "," + CollectionSchema.h2_txt.getSolrFieldName() + "," + CollectionSchema.h1_txt.getSolrFieldName() + "," + 
CollectionSchema.text_t.getSolrFieldName()}); if (!mmsp.getMap().containsKey("hl.alternateField")) mmsp.getMap().put("hl.alternateField", new String[]{CollectionSchema.description_txt.getSolrFieldName()}); if (!mmsp.getMap().containsKey("hl.simple.pre")) mmsp.getMap().put("hl.simple.pre", new String[]{""}); if (!mmsp.getMap().containsKey("hl.simple.post")) mmsp.getMap().put("hl.simple.post", new String[]{""}); if (!mmsp.getMap().containsKey("hl.fragsize")) mmsp.getMap().put("hl.fragsize", new String[]{Integer.toString(SearchEvent.SNIPPET_MAX_LENGTH)}); + if (!mmsp.getMap().containsKey("fl")) mmsp.getMap().put("fl", new String[]{ + CollectionSchema.sku.getSolrFieldName() + "," + + CollectionSchema.title + "," + + CollectionSchema.description_txt.getSolrFieldName() + "," + + CollectionSchema.id.getSolrFieldName() + "," + + CollectionSchema.url_paths_sxt.getSolrFieldName() + "," + + CollectionSchema.last_modified.getSolrFieldName() + "," + + CollectionSchema.size_i.getSolrFieldName() + "," + + CollectionSchema.url_protocol_s.getSolrFieldName() + "," + + CollectionSchema.url_file_ext_s.getSolrFieldName()}); } // get the embedded connector From db6d8fc197926f4627d8e0bd1c8393bbb93c6f58 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 6 Sep 2016 07:44:38 +0200 Subject: [PATCH 21/28] fix for bad json --- .../solr/responsewriter/OpensearchResponseWriter.java | 2 +- .../federate/solr/responsewriter/YJsonResponseWriter.java | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/source/net/yacy/cora/federate/solr/responsewriter/OpensearchResponseWriter.java b/source/net/yacy/cora/federate/solr/responsewriter/OpensearchResponseWriter.java index 7d39e54bc..d6f728ca3 100644 --- a/source/net/yacy/cora/federate/solr/responsewriter/OpensearchResponseWriter.java +++ b/source/net/yacy/cora/federate/solr/responsewriter/OpensearchResponseWriter.java @@ -349,7 +349,7 @@ public class OpensearchResponseWriter implements QueryResponseWriter { for (String s: 
snippets) { if ((l == null || s.length() > l.length()) && s.indexOf(' ') > 0) l = s; } - return l; + return l.replaceAll("\"", "'"); } public static void openTag(final Writer writer, final String tag) throws IOException { diff --git a/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java b/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java index eda6b71fc..b3f5abccd 100644 --- a/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java +++ b/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java @@ -203,7 +203,7 @@ public class YJsonResponseWriter implements QueryResponseWriter { // compute snippet from texts solitaireTag(writer, "path", path.toString()); - solitaireTag(writer, "title", title.length() == 0 ? path.toString() : title); + solitaireTag(writer, "title", title.length() == 0 ? path.toString() : title.replaceAll("\"", "'")); LinkedHashSet snippet = urlhash == null ? null : snippets.get(urlhash); if (snippet == null) {snippet = new LinkedHashSet<>(); snippet.addAll(descriptions);} OpensearchResponseWriter.removeSubsumedTitle(snippet, title); @@ -293,7 +293,8 @@ public class YJsonResponseWriter implements QueryResponseWriter { writer.write('"'); writer.write(tagname); writer.write("\":\""); writer.write(serverObjects.toJSON(value)); writer.write("\","); writer.write('\n'); } - private static void facetEntry(final Writer writer, final String modifier, final String propname, String value) throws IOException { + private static void facetEntry(final Writer writer, final String modifier, String propname, String value) throws IOException { + propname = propname.replaceAll("\"", "'"); writer.write("{\"name\": \""); writer.write(propname); writer.write("\", \"count\": \""); writer.write(value); writer.write("\", \"modifier\": \""); writer.write(modifier); writer.write("%3A"); writer.write(propname); From 8681cee3f30f5f48a7401e1fe8d25efa1ac100ea Mon Sep 17 00:00:00 2001 From: Michael 
Peter Christen Date: Tue, 6 Sep 2016 09:00:35 +0200 Subject: [PATCH 22/28] fix for bad comma --- .../responsewriter/YJsonResponseWriter.java | 23 ++++++++++++++----- 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java b/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java index b3f5abccd..4c5ac0efd 100644 --- a/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java +++ b/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java @@ -230,16 +230,20 @@ public class YJsonResponseWriter implements QueryResponseWriter { @SuppressWarnings("unchecked") NamedList collections = facetFields == null ? null : (NamedList) facetFields.get(CollectionSchema.collection_sxt.getSolrFieldName()); + int facetcount = 0; if (domains != null) { + writer.write(facetcount > 0 ? ",\n" : "\n"); writer.write("{\"facetname\":\"domains\",\"displayname\":\"Provider\",\"type\":\"String\",\"min\":\"0\",\"max\":\"0\",\"mean\":\"0\",\"elements\":[\n".toCharArray()); for (int i = 0; i < domains.size(); i++) { facetEntry(writer, "site", domains.getName(i), Integer.toString(domains.getVal(i))); if (i < domains.size() - 1) writer.write(','); writer.write("\n"); } - writer.write("]},\n".toCharArray()); + writer.write("]}".toCharArray()); + facetcount++; } if (filetypes != null) { + writer.write(facetcount > 0 ? ",\n" : "\n"); writer.write("{\"facetname\":\"filetypes\",\"displayname\":\"Filetypes\",\"type\":\"String\",\"min\":\"0\",\"max\":\"0\",\"mean\":\"0\",\"elements\":[\n".toCharArray()); List> l = new ArrayList>(); for (Map.Entry e: filetypes) { @@ -252,36 +256,43 @@ public class YJsonResponseWriter implements QueryResponseWriter { if (i < l.size() - 1) writer.write(','); writer.write("\n"); } - writer.write("]},\n".toCharArray()); + writer.write("]}".toCharArray()); + facetcount++; } if (protocols != null) { + writer.write(facetcount > 0 ? 
",\n" : "\n"); writer.write("{\"facetname\":\"protocols\",\"displayname\":\"Protocol\",\"type\":\"String\",\"min\":\"0\",\"max\":\"0\",\"mean\":\"0\",\"elements\":[\n".toCharArray()); for (int i = 0; i < protocols.size(); i++) { facetEntry(writer, "protocol", protocols.getName(i), Integer.toString(protocols.getVal(i))); if (i < protocols.size() - 1) writer.write(','); writer.write("\n"); } - writer.write("]},\n".toCharArray()); + writer.write("]}".toCharArray()); + facetcount++; } if (authors != null) { + writer.write(facetcount > 0 ? ",\n" : "\n"); writer.write("{\"facetname\":\"authors\",\"displayname\":\"Authors\",\"type\":\"String\",\"min\":\"0\",\"max\":\"0\",\"mean\":\"0\",\"elements\":[\n".toCharArray()); for (int i = 0; i < authors.size(); i++) { facetEntry(writer, "author", authors.getName(i), Integer.toString(authors.getVal(i))); if (i < authors.size() - 1) writer.write(','); writer.write("\n"); } - writer.write("]},\n".toCharArray()); + writer.write("]}".toCharArray()); + facetcount++; } if (collections != null) { + writer.write(facetcount > 0 ? 
",\n" : "\n"); writer.write("{\"facetname\":\"collections\",\"displayname\":\"Collections\",\"type\":\"String\",\"min\":\"0\",\"max\":\"0\",\"mean\":\"0\",\"elements\":[\n".toCharArray()); for (int i = 0; i < collections.size(); i++) { facetEntry(writer, "collection", collections.getName(i), Integer.toString(collections.getVal(i))); if (i < collections.size() - 1) writer.write(','); writer.write("\n"); } - writer.write("]},\n".toCharArray()); + writer.write("]}".toCharArray()); + facetcount++; } - writer.write("]}]}\n".toCharArray()); + writer.write("\n]}]}\n".toCharArray()); if (jsonp != null) { writer.write("])".toCharArray()); From 5060f9fee94064a1571578d648f832bcda503781 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 6 Sep 2016 09:05:39 +0200 Subject: [PATCH 23/28] fix for too long snippets --- .../federate/solr/responsewriter/YJsonResponseWriter.java | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java b/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java index 4c5ac0efd..94b8bcbf4 100644 --- a/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java +++ b/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java @@ -207,7 +207,13 @@ public class YJsonResponseWriter implements QueryResponseWriter { LinkedHashSet snippet = urlhash == null ? null : snippets.get(urlhash); if (snippet == null) {snippet = new LinkedHashSet<>(); snippet.addAll(descriptions);} OpensearchResponseWriter.removeSubsumedTitle(snippet, title); - writer.write("\"description\":\""); writer.write(serverObjects.toJSON(snippet == null || snippet.size() == 0 ? (descriptions.size() > 0 ? descriptions.get(0) : "") : OpensearchResponseWriter.getLargestSnippet(snippet))); writer.write("\"\n}\n"); + String snippetstring = snippet == null || snippet.size() == 0 ? (descriptions.size() > 0 ? 
descriptions.get(0) : "") : OpensearchResponseWriter.getLargestSnippet(snippet); + if (snippetstring.length() > 140) { + snippetstring = snippetstring.substring(0, 140); + int sp = snippetstring.lastIndexOf(' '); + if (sp >= 0) snippetstring = snippetstring.substring(0, sp) + " ..."; else snippetstring = snippetstring + "..."; + } + writer.write("\"description\":\""); writer.write(serverObjects.toJSON(snippetstring)); writer.write("\"\n}\n"); if (i < responseCount - 1) { writer.write(",\n".toCharArray()); } From 6139bd85a8fb54fd903fcc8479ea7b6c075908cd Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 6 Sep 2016 17:19:54 +0200 Subject: [PATCH 24/28] fix for broken facet names --- .../federate/solr/responsewriter/YJsonResponseWriter.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java b/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java index 94b8bcbf4..da59055bb 100644 --- a/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java +++ b/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java @@ -310,8 +310,9 @@ public class YJsonResponseWriter implements QueryResponseWriter { writer.write('"'); writer.write(tagname); writer.write("\":\""); writer.write(serverObjects.toJSON(value)); writer.write("\","); writer.write('\n'); } - private static void facetEntry(final Writer writer, final String modifier, String propname, String value) throws IOException { - propname = propname.replaceAll("\"", "'"); + private static void facetEntry(final Writer writer, String modifier, String propname, String value) throws IOException { + modifier = modifier.replaceAll("\"", "'").trim(); + propname = propname.replaceAll("\"", "'").trim(); writer.write("{\"name\": \""); writer.write(propname); writer.write("\", \"count\": \""); writer.write(value); writer.write("\", \"modifier\": \""); writer.write(modifier); 
writer.write("%3A"); writer.write(propname); From c716648c7810e681236a43cc19e66e9b9a53c79e Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 6 Sep 2016 18:45:29 +0200 Subject: [PATCH 25/28] enhanced json encoding of strings --- htroot/yacy/seedlist.java | 7 +++-- .../responsewriter/YJsonResponseWriter.java | 16 +++++----- source/net/yacy/server/serverObjects.java | 30 ++++--------------- 3 files changed, 17 insertions(+), 36 deletions(-) diff --git a/htroot/yacy/seedlist.java b/htroot/yacy/seedlist.java index d5d88a5cc..dba529837 100644 --- a/htroot/yacy/seedlist.java +++ b/htroot/yacy/seedlist.java @@ -24,6 +24,7 @@ import java.util.Map; import java.util.Set; import net.yacy.cora.protocol.RequestHeader; +import net.yacy.cora.util.JSONObject; import net.yacy.peers.Seed; import net.yacy.search.Switchboard; import net.yacy.server.serverCore; @@ -91,14 +92,14 @@ public final class seedlist { Set ips = seed.getIPs(); if (ips == null || ips.size() == 0) continue; prop.putJSON("peers_" + count + "_map_0_k", Seed.HASH); - prop.put("peers_" + count + "_map_0_v", '"' + serverObjects.toJSON(seed.hash) + '"'); + prop.put("peers_" + count + "_map_0_v", JSONObject.quote(seed.hash)); prop.put("peers_" + count + "_map_0_c", 1); Map map = seed.getMap(); int c = 1; if (!addressonly) { for (Map.Entry m: map.entrySet()) { prop.putJSON("peers_" + count + "_map_" + c + "_k", m.getKey()); - prop.put("peers_" + count + "_map_" + c + "_v", '"' + serverObjects.toJSON(m.getValue()) + '"'); + prop.put("peers_" + count + "_map_" + c + "_v", JSONObject.quote(m.getValue())); prop.put("peers_" + count + "_map_" + c + "_c", 1); c++; } @@ -106,7 +107,7 @@ public final class seedlist { // construct a list of ips StringBuilder a = new StringBuilder(); a.append('['); - for (String ip: ips) a.append('"').append(serverObjects.toJSON(seed.getPublicAddress(ip))).append('"').append(','); + for (String ip: ips) a.append(JSONObject.quote(seed.getPublicAddress(ip))).append(','); 
a.setCharAt(a.length()-1, ']'); prop.putJSON("peers_" + count + "_map_" + c + "_k", "Address"); prop.put("peers_" + count + "_map_" + c + "_v", a.toString()); diff --git a/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java b/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java index da59055bb..d32254266 100644 --- a/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java +++ b/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java @@ -34,9 +34,9 @@ import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.responsewriter.OpensearchResponseWriter.ResHead; import net.yacy.cora.protocol.HeaderFramework; +import net.yacy.cora.util.JSONObject; import net.yacy.data.URLLicense; import net.yacy.search.schema.CollectionSchema; -import net.yacy.server.serverObjects; import org.apache.lucene.document.Document; import org.apache.lucene.index.IndexableField; @@ -213,7 +213,7 @@ public class YJsonResponseWriter implements QueryResponseWriter { int sp = snippetstring.lastIndexOf(' '); if (sp >= 0) snippetstring = snippetstring.substring(0, sp) + " ..."; else snippetstring = snippetstring + "..."; } - writer.write("\"description\":\""); writer.write(serverObjects.toJSON(snippetstring)); writer.write("\"\n}\n"); + writer.write("\"description\":"); writer.write(JSONObject.quote(snippetstring)); writer.write("\n}\n"); if (i < responseCount - 1) { writer.write(",\n".toCharArray()); } @@ -307,16 +307,16 @@ public class YJsonResponseWriter implements QueryResponseWriter { public static void solitaireTag(final Writer writer, final String tagname, String value) throws IOException { if (value == null) return; - writer.write('"'); writer.write(tagname); writer.write("\":\""); writer.write(serverObjects.toJSON(value)); writer.write("\","); writer.write('\n'); + writer.write('"'); writer.write(tagname); writer.write("\":"); 
writer.write(JSONObject.quote(value)); writer.write(','); writer.write('\n'); } - private static void facetEntry(final Writer writer, String modifier, String propname, String value) throws IOException { + private static void facetEntry(final Writer writer, String modifier, String propname, final String value) throws IOException { modifier = modifier.replaceAll("\"", "'").trim(); propname = propname.replaceAll("\"", "'").trim(); - writer.write("{\"name\": \""); writer.write(propname); - writer.write("\", \"count\": \""); writer.write(value); - writer.write("\", \"modifier\": \""); writer.write(modifier); writer.write("%3A"); writer.write(propname); - writer.write("\"}"); + writer.write("{\"name\":"); writer.write(JSONObject.quote(propname)); + writer.write(",\"count\":"); writer.write(JSONObject.quote(value.replaceAll("\"", "'").trim())); + writer.write(",\"modifier\":"); writer.write(JSONObject.quote(modifier+"%3A"+propname)); + writer.write("}"); } } /** diff --git a/source/net/yacy/server/serverObjects.java b/source/net/yacy/server/serverObjects.java index d5be88e51..1e2480056 100644 --- a/source/net/yacy/server/serverObjects.java +++ b/source/net/yacy/server/serverObjects.java @@ -63,6 +63,7 @@ import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader.FileType; +import net.yacy.cora.util.JSONObject; import net.yacy.document.parser.html.CharacterCoding; import net.yacy.kelondro.util.Formatter; import net.yacy.search.Switchboard; @@ -81,12 +82,6 @@ public class serverObjects implements Serializable, Cloneable { public final static String ADMIN_AUTHENTICATE_MSG = "admin log-in. 
If you don't know the password, set it with {yacyhome}/bin/passwd.sh {newpassword}"; private final static Pattern patternNewline = Pattern.compile("\n"); - private final static Pattern patternDoublequote = Pattern.compile("\""); - private final static Pattern patternSlash = Pattern.compile("/"); - private final static Pattern patternB = Pattern.compile("\b"); - private final static Pattern patternF = Pattern.compile("\f"); - private final static Pattern patternR = Pattern.compile("\r"); - private final static Pattern patternT = Pattern.compile("\t"); private boolean localized = true; @@ -284,20 +279,10 @@ public class serverObjects implements Serializable, Cloneable { * @param key key name as String. * @param value a String that will be reencoded for JSON output. */ - public void putJSON(final String key, final String value) { - put(key, toJSON(value)); - } - - public static String toJSON(String value) { - // value = value.replaceAll("\\", "\\\\"); - value = patternDoublequote.matcher(value).replaceAll("'"); - value = patternSlash.matcher(value).replaceAll("\\/"); - value = patternB.matcher(value).replaceAll("\\b"); - value = patternF.matcher(value).replaceAll("\\f"); - value = patternNewline.matcher(value).replaceAll("\\r"); - value = patternR.matcher(value).replaceAll("\\r"); - value = patternT.matcher(value).replaceAll("\\t"); - return value; + public void putJSON(final String key, String value) { + value = JSONObject.quote(value); + value = value.substring(1, value.length() - 1); + put(key, value); } /** @@ -558,9 +543,4 @@ public class serverObjects implements Serializable, Cloneable { return this.map; } - public static void main(final String[] args) { - final String v = "ein \"zitat\""; - System.out.println(toJSON(v)); - } - } From 5e165a8150df2d1a603732f451aa61392054dde5 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 6 Sep 2016 18:46:24 +0200 Subject: [PATCH 26/28] removed unused imports --- htroot/Autocrawl_p.java | 1 - 
htroot/WebStructurePicture_p.java | 1 - htroot/yacy/search.java | 1 - source/net/yacy/cora/lod/vocabulary/Tagging.java | 1 - source/net/yacy/cora/util/Html2Image.java | 1 - source/net/yacy/crawler/data/CrawlQueues.java | 2 -- source/net/yacy/crawler/retrieval/Response.java | 1 - source/net/yacy/document/TextParser.java | 1 - source/net/yacy/document/parser/sitemapParser.java | 1 - source/net/yacy/document/parser/zipParser.java | 1 - source/net/yacy/peers/Network.java | 2 -- source/net/yacy/repository/Blacklist.java | 1 - test/java/net/yacy/data/wiki/WikiCodeTest.java | 1 - 13 files changed, 15 deletions(-) diff --git a/htroot/Autocrawl_p.java b/htroot/Autocrawl_p.java index 59aa6bc65..27c629a2d 100644 --- a/htroot/Autocrawl_p.java +++ b/htroot/Autocrawl_p.java @@ -1,5 +1,4 @@ import net.yacy.cora.protocol.RequestHeader; -import net.yacy.crawler.data.CrawlProfile.CrawlAttribute; import net.yacy.search.Switchboard; import net.yacy.search.SwitchboardConstants; import net.yacy.server.serverObjects; diff --git a/htroot/WebStructurePicture_p.java b/htroot/WebStructurePicture_p.java index ff548544c..12bf1affa 100644 --- a/htroot/WebStructurePicture_p.java +++ b/htroot/WebStructurePicture_p.java @@ -39,7 +39,6 @@ import net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.sorting.ClusteredScoreMap; import net.yacy.cora.util.CommonPattern; -import net.yacy.cora.util.ConcurrentLog; import net.yacy.peers.graphics.WebStructureGraph; import net.yacy.search.Switchboard; import net.yacy.server.serverObjects; diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index c208e4282..a2aa9e11c 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -45,7 +45,6 @@ import net.yacy.cora.lod.vocabulary.Tagging; import net.yacy.cora.protocol.Domains; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; -import net.yacy.cora.sorting.ConcurrentScoreMap; import 
net.yacy.cora.sorting.ScoreMap; import net.yacy.cora.sorting.WeakPriorityBlockingQueue; import net.yacy.cora.storage.HandleSet; diff --git a/source/net/yacy/cora/lod/vocabulary/Tagging.java b/source/net/yacy/cora/lod/vocabulary/Tagging.java index 5281c900b..b6afab028 100644 --- a/source/net/yacy/cora/lod/vocabulary/Tagging.java +++ b/source/net/yacy/cora/lod/vocabulary/Tagging.java @@ -37,7 +37,6 @@ import java.util.concurrent.BlockingQueue; import java.util.concurrent.ConcurrentHashMap; import java.util.regex.Pattern; -import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.geo.GeoLocation; import net.yacy.cora.geo.Locations; import net.yacy.cora.storage.Files; diff --git a/source/net/yacy/cora/util/Html2Image.java b/source/net/yacy/cora/util/Html2Image.java index e6fbad6a2..139d27b25 100644 --- a/source/net/yacy/cora/util/Html2Image.java +++ b/source/net/yacy/cora/util/Html2Image.java @@ -47,7 +47,6 @@ import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.OS; import org.apache.pdfbox.pdmodel.PDDocument; -import org.apache.pdfbox.pdmodel.PDPage; import org.apache.pdfbox.rendering.ImageType; import org.apache.pdfbox.rendering.PDFRenderer; diff --git a/source/net/yacy/crawler/data/CrawlQueues.java b/source/net/yacy/crawler/data/CrawlQueues.java index 4593c419d..4e272204a 100644 --- a/source/net/yacy/crawler/data/CrawlQueues.java +++ b/source/net/yacy/crawler/data/CrawlQueues.java @@ -29,7 +29,6 @@ package net.yacy.crawler.data; import java.io.File; import java.io.IOException; import java.net.MalformedURLException; -import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Date; import java.util.HashMap; @@ -51,7 +50,6 @@ import net.yacy.cora.document.feed.Hit; import net.yacy.cora.document.feed.RSSFeed; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.solr.FailCategory; -import net.yacy.cora.federate.solr.connector.SolrConnector; import net.yacy.cora.federate.yacy.CacheStrategy; import 
net.yacy.cora.order.Base64Order; import net.yacy.cora.protocol.ConnectionInfo; diff --git a/source/net/yacy/crawler/retrieval/Response.java b/source/net/yacy/crawler/retrieval/Response.java index acbcf0641..8a736946a 100644 --- a/source/net/yacy/crawler/retrieval/Response.java +++ b/source/net/yacy/crawler/retrieval/Response.java @@ -32,7 +32,6 @@ import java.util.Date; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.UTF8; -import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.protocol.HeaderFramework; diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java index d0b3d286a..f483a0f2c 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -52,7 +52,6 @@ import net.yacy.document.parser.ooxmlParser; import net.yacy.document.parser.pdfParser; import net.yacy.document.parser.pptParser; import net.yacy.document.parser.psParser; -import net.yacy.document.parser.rdfParser; import net.yacy.document.parser.rssParser; import net.yacy.document.parser.rtfParser; import net.yacy.document.parser.sevenzipParser; diff --git a/source/net/yacy/document/parser/sitemapParser.java b/source/net/yacy/document/parser/sitemapParser.java index 65b1d07b6..9ea66ddb5 100644 --- a/source/net/yacy/document/parser/sitemapParser.java +++ b/source/net/yacy/document/parser/sitemapParser.java @@ -31,7 +31,6 @@ import java.net.MalformedURLException; import java.text.ParseException; import java.util.ArrayList; import java.util.Date; -import java.util.LinkedHashMap; import java.util.List; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; diff --git a/source/net/yacy/document/parser/zipParser.java b/source/net/yacy/document/parser/zipParser.java index ab5f800d5..a1a367344 100644 --- 
a/source/net/yacy/document/parser/zipParser.java +++ b/source/net/yacy/document/parser/zipParser.java @@ -31,7 +31,6 @@ import java.util.Date; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; -import net.yacy.cora.document.id.AnchorURL; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.document.AbstractParser; diff --git a/source/net/yacy/peers/Network.java b/source/net/yacy/peers/Network.java index ac3cf49ba..79ba206b2 100644 --- a/source/net/yacy/peers/Network.java +++ b/source/net/yacy/peers/Network.java @@ -48,8 +48,6 @@ import java.util.List; import java.util.Map; import java.util.Set; import java.util.concurrent.ConcurrentHashMap; -import java.util.concurrent.Semaphore; - import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.feed.RSSFeed; diff --git a/source/net/yacy/repository/Blacklist.java b/source/net/yacy/repository/Blacklist.java index ce72e689e..116b29934 100644 --- a/source/net/yacy/repository/Blacklist.java +++ b/source/net/yacy/repository/Blacklist.java @@ -33,7 +33,6 @@ import java.io.IOException; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.io.PrintWriter; -import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.List; diff --git a/test/java/net/yacy/data/wiki/WikiCodeTest.java b/test/java/net/yacy/data/wiki/WikiCodeTest.java index ca110fb9a..ef25bf7ee 100644 --- a/test/java/net/yacy/data/wiki/WikiCodeTest.java +++ b/test/java/net/yacy/data/wiki/WikiCodeTest.java @@ -1,6 +1,5 @@ package net.yacy.data.wiki; -import java.io.BufferedReader; import org.junit.Test; import static org.junit.Assert.*; From 272cdd496a33a4c282443de7e70d4653613b96be Mon Sep 17 00:00:00 2001 From: reger Date: Wed, 7 Sep 2016 02:16:16 +0200 Subject: [PATCH 27/28] reactivate sentence counter in WordTokenizer for phrasepos ranking, by counting punktuation 
(delivered as 1 char word) again. --- source/net/yacy/document/Tokenizer.java | 66 +++++++++---------- source/net/yacy/document/WordTokenizer.java | 10 ++- .../net/yacy/document/WordTokenizerTest.java | 8 ++- 3 files changed, 47 insertions(+), 37 deletions(-) diff --git a/source/net/yacy/document/Tokenizer.java b/source/net/yacy/document/Tokenizer.java index ca5591795..7e27492dd 100644 --- a/source/net/yacy/document/Tokenizer.java +++ b/source/net/yacy/document/Tokenizer.java @@ -78,7 +78,7 @@ public class Tokenizer { int wordHandleCount = 0; //final int sentenceHandleCount = 0; int allwordcounter = 0; - final int allsentencecounter = 0; + int allsentencecounter = 0; int wordInSentenceCounter = 1; boolean comb_indexof = false, last_last = false, last_index = false; //final Map sentences = new HashMap(100); @@ -89,6 +89,14 @@ public class Tokenizer { try { while (wordenum.hasMoreElements()) { String word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH); + // handle punktuation (start new sentence) + if (word.length() == 1 && SentenceReader.punctuation(word.charAt(0))) { + // store sentence + currsentwords.clear(); + wordInSentenceCounter = 1; + allsentencecounter++; + continue; + } if (word.length() < wordminsize) continue; // get tags from autotagging @@ -144,40 +152,32 @@ public class Tokenizer { System.arraycopy(wordcache, 1, wordcache, 0, wordcache.length - 1); wordcache[wordcache.length - 1] = word; - // distinguish punctuation and words - wordlen = word.length(); - if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) { // TODO: wordlen == 1 never true (see earlier if < wordminsize ) - // store sentence - currsentwords.clear(); - wordInSentenceCounter = 1; + // check index.of detection + if (last_last && comb_indexof && word.equals("modified")) { + this.RESULT_FLAGS.set(flag_cat_indexof, true); + wordenum.pre(true); // parse lines as they come with CRLF + } + if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = 
true; + last_last = word.equals("last"); + last_index = word.equals("index"); + + // store word + allwordcounter++; + currsentwords.add(word); + Word wsp = this.words.get(word); + if (wsp != null) { + // word already exists + wordHandle = wsp.posInText; + wsp.inc(); } else { - // check index.of detection - if (last_last && comb_indexof && word.equals("modified")) { - this.RESULT_FLAGS.set(flag_cat_indexof, true); - wordenum.pre(true); // parse lines as they come with CRLF - } - if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true; - last_last = word.equals("last"); - last_index = word.equals("index"); - - // store word - allwordcounter++; - currsentwords.add(word); - Word wsp = this.words.get(word); - if (wsp != null) { - // word already exists - wordHandle = wsp.posInText; - wsp.inc(); - } else { - // word does not yet exist, create new word entry - wordHandle = ++wordHandleCount; // let start pos with 1 - wsp = new Word(wordHandle, wordInSentenceCounter, /* sentences.size() + */ 100); - wsp.flags = this.RESULT_FLAGS.clone(); - this.words.put(word.toLowerCase(), wsp); - } - // we now have the unique handle of the word, put it into the sentence: - wordInSentenceCounter++; + // word does not yet exist, create new word entry + wordHandle = ++wordHandleCount; // let start pos with 1 + wsp = new Word(wordHandle, wordInSentenceCounter, allsentencecounter + 100); // normal sentence start at 100 ! 
+ wsp.flags = this.RESULT_FLAGS.clone(); + this.words.put(word.toLowerCase(), wsp); } + // we now have the unique handle of the word, put it into the sentence: + wordInSentenceCounter++; } } finally { wordenum.close(); diff --git a/source/net/yacy/document/WordTokenizer.java b/source/net/yacy/document/WordTokenizer.java index 69d78ae71..25caf88ac 100644 --- a/source/net/yacy/document/WordTokenizer.java +++ b/source/net/yacy/document/WordTokenizer.java @@ -56,7 +56,7 @@ public class WordTokenizer implements Enumeration { private StringBuilder nextElement0() { StringBuilder s; while (this.e.hasMoreElements()) { - s = this.e.nextElement(); // next word (punctuation and invisible chars filtered) + s = this.e.nextElement(); // next word (invisible chars filtered) return s; } return null; @@ -118,7 +118,13 @@ public class WordTokenizer implements Enumeration { for (int i = 0; i < r.length(); i++) { // tokenize one sentence c = r.charAt(i); if (SentenceReader.punctuation(c)) { // punctuation check is simple/quick, do it before invisible - if (sb.length() > 0) {this.s.add(sb); sb = new StringBuilder(20);} + if (sb.length() > 0) { + this.s.add(sb); + sb = new StringBuilder(1); + } + sb.append(c); + this.s.add(sb); + sb = new StringBuilder(20); } else if (SentenceReader.invisible(c)) { // ! 
currently punctuation again checked by invisible() if (sb.length() > 0) {this.s.add(sb); sb = new StringBuilder(20);} } else { diff --git a/test/java/net/yacy/document/WordTokenizerTest.java b/test/java/net/yacy/document/WordTokenizerTest.java index c32e71ead..7f4250953 100644 --- a/test/java/net/yacy/document/WordTokenizerTest.java +++ b/test/java/net/yacy/document/WordTokenizerTest.java @@ -22,8 +22,12 @@ public class WordTokenizerTest { int cnt = 0; while (wt.hasMoreElements()) { StringBuilder sb = wt.nextElement(); - assertEquals("word", sb.toString()); - cnt++; + if (sb.length() > 1) { // skip punctuation + assertEquals("word", sb.toString()); + cnt++; + } else { + assertTrue("punktuation", SentenceReader.punctuation(sb.charAt(0))); + } } wt.close(); assertEquals(10, cnt); From 25a3c7a6d0ad5ddc0213e8b407cb275871117e62 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Wed, 7 Sep 2016 06:48:52 +0200 Subject: [PATCH 28/28] catch exception and write end of object --- .../solr/responsewriter/YJsonResponseWriter.java | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java b/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java index d32254266..b087e3d5a 100644 --- a/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java +++ b/source/net/yacy/cora/federate/solr/responsewriter/YJsonResponseWriter.java @@ -34,6 +34,7 @@ import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.responsewriter.OpensearchResponseWriter.ResHead; import net.yacy.cora.protocol.HeaderFramework; +import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.JSONObject; import net.yacy.data.URLLicense; import net.yacy.search.schema.CollectionSchema; @@ -217,7 +218,13 @@ public class YJsonResponseWriter implements QueryResponseWriter { if (i < responseCount - 1) { 
writer.write(",\n".toCharArray()); } - } catch (final Throwable ee) {} + } catch (final Throwable ee) { + ConcurrentLog.logException(ee); + writer.write("\"description\":\"\"\n}\n"); + if (i < responseCount - 1) { + writer.write(",\n".toCharArray()); + } + } } writer.write("],\n".toCharArray());