From 262b23532dfb638e26d89367decb84cffaedf490 Mon Sep 17 00:00:00 2001 From: HeliosLHC <6173500+HeliosLHC@users.noreply.github.com> Date: Sun, 5 May 2024 23:23:34 +0200 Subject: [PATCH 1/4] reduce image size --- docker/Dockerfile | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index 28444c932..c2f90c41c 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -6,12 +6,14 @@ # run with # docker run -d --name yacy -p 8090:8090 -p 8443:8443 -v yacy_data:/opt/yacy_search_server/DATA --log-opt max-size=200m --log-opt max-file=2 yacy/yacy_search_server:latest - +## build base +FROM eclipse-temurin:11-jdk-jammy AS base +RUN apt-get update && apt-get install -yq wkhtmltopdf imagemagick xvfb ghostscript && rm -rf /var/lib/apt/lists/* ## build app -FROM eclipse-temurin:11-jdk-jammy AS appbuilder +FROM base AS appbuilder -RUN apt-get update && apt-get install -yq ant git curl wkhtmltopdf imagemagick xvfb ghostscript && rm -rf /var/lib/apt/lists/* +RUN apt-get update && apt-get install -yq ant git curl && rm -rf /var/lib/apt/lists/* RUN java -version WORKDIR /opt @@ -20,8 +22,12 @@ COPY . /opt/yacy_search_server/ RUN ant compile -f /opt/yacy_search_server/build.xml && \ apt-get purge -yq --auto-remove ant && \ apt-get clean && \ - rm -rf /var/lib/apt/lists/* && \ - rm -rf /opt/yacy_search_server/.git + rm -rf /var/lib/apt/lists/* + +WORKDIR /opt/yacy_search_server/ +RUN git rev-parse HEAD > .git/shallow && \ + git tag -l | xargs git tag -d && \ + git gc --prune=now # Set initial admin password: "yacy" (encoded with custom yacy md5 function net.yacy.cora.order.Digest.encodeMD5Hex()) RUN sed -i "/adminAccountBase64MD5=/c\adminAccountBase64MD5=MD5:8cffbc0d66567a0987a4aba1ec46d63c" /opt/yacy_search_server/defaults/yacy.init && \ @@ -31,16 +37,13 @@ RUN sed -i "/adminAccountBase64MD5=/c\adminAccountBase64MD5=MD5:8cffbc0d66567a09 ## build dist -FROM eclipse-temurin:11-jre-jammy +FROM base LABEL maintainer="Michael Peter Christen " -RUN apt-get update && apt-get install -yq wkhtmltopdf imagemagick xvfb ghostscript && rm -rf /var/lib/apt/lists/* - +RUN adduser --system --group --no-create-home --disabled-password yacy WORKDIR /opt -COPY . /opt/yacy_search_server/ -COPY --from=appbuilder /opt/yacy_search_server /opt/yacy_search_server +COPY --chown=yacy:yacy --from=appbuilder /opt/yacy_search_server /opt/yacy_search_server -RUN adduser --system --group --no-create-home --disabled-password yacy && chown yacy:yacy -R /opt/yacy_search_server EXPOSE 8090 8443 VOLUME ["/opt/yacy_search_server/DATA"] USER yacy From b295e38969b99ae27fe494b82ff50f9a57ccc7f0 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Fri, 10 May 2024 12:13:44 +0200 Subject: [PATCH 2/4] fine-tuned the import process of jsonl files which had been missing to actually be able to make searches and browse the index with the host browser --- .../document/importer/JsonListImporter.java | 20 +++++++++++++ source/net/yacy/search/Switchboard.java | 29 ++++++++++--------- 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/source/net/yacy/document/importer/JsonListImporter.java b/source/net/yacy/document/importer/JsonListImporter.java index 06082d701..7392f39ae 100644 --- a/source/net/yacy/document/importer/JsonListImporter.java +++ b/source/net/yacy/document/importer/JsonListImporter.java @@ -148,6 +148,11 @@ public class JsonListImporter extends Thread implements Importer { } if ((json.opt("index") != null && json.length() == 1) || json.length() == 0) continue; final SolrInputDocument surrogate = new SolrInputDocument(); + + // set default values which act as constraints for a proper search + CollectionSchema.httpstatus_i.add(surrogate, 200); + + // get fields for json object jsonreader: for (final String key: json.keySet()) { final Object o = json.opt(key); if (o == null) continue; @@ -212,10 +217,19 @@ public class JsonListImporter extends Thread implements Importer { final String id = ASCII.String(durl.hash()); surrogate.setField(CollectionSchema.sku.getSolrFieldName(), durl.toNormalform(true)); surrogate.setField(CollectionSchema.id.getSolrFieldName(), id); + surrogate.setField(CollectionSchema.host_s.getSolrFieldName(), durl.getHost()); surrogate.setField(CollectionSchema.host_id_s.getSolrFieldName(), id.substring(6)); continue jsonreader; } + if (key.equals("description")) { + // in YaCy descriptions are full-text indexed and also multi-value fields + final List descriptions = new ArrayList<>(); + descriptions.add(o.toString()); + CollectionSchema.description_txt.add(surrogate, descriptions); + continue jsonreader; + } if (key.equals("referrer_url_s")) { + // same patch as for urls which require re-calculation of id's; in this case we store the id only! final DigestURL durl = new DigestURL(o.toString()); final String id = ASCII.String(durl.hash()); surrogate.setField(CollectionSchema.referrer_id_s.getSolrFieldName(), id); @@ -236,6 +250,12 @@ public class JsonListImporter extends Thread implements Importer { continue jsonreader; } + // check if required fields are still missing and compute them + if (!surrogate.containsKey(CollectionSchema.host_s.getSolrFieldName())) { + final DigestURL durl = new DigestURL((String) surrogate.getFieldValue(CollectionSchema.sku.getSolrFieldName())); + surrogate.setField(CollectionSchema.host_s.getSolrFieldName(), durl.getHost()); + } + // regular situation, just read content of field surrogate.setField(key, o.toString()); } diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 0d5e03c00..0c39b51cf 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -2325,7 +2325,10 @@ public final class Switchboard extends serverSwitch { || s.endsWith(".xml.zip") || s.endsWith(".warc") || s.endsWith(".warc.gz") + || s.endsWith(".jsonl") + || s.endsWith(".jsonl.gz") || s.endsWith(".jsonlist") + || s.endsWith(".jsonlist.gz") || s.endsWith(".flatjson") ) { count++; } @@ -3167,9 +3170,9 @@ public final class Switchboard extends serverSwitch { } // check mustmatch pattern - Pattern mustmatchurl = profile.indexUrlMustMatchPattern(); + final Pattern mustmatchurl = profile.indexUrlMustMatchPattern(); if (mustmatchurl != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchurl.matcher(urls).matches()) { - String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + mustmatchurl.pattern(); + final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustMatchPattern = " + mustmatchurl.pattern(); if (this.log.isInfo()) this.log.info(info); // create a new errorURL DB entry this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1); @@ -3177,9 +3180,9 @@ public final class Switchboard extends serverSwitch { } // check mustnotmatch - Pattern mustnotmatchurl = profile.indexUrlMustNotMatchPattern(); + final Pattern mustnotmatchurl = profile.indexUrlMustNotMatchPattern(); if (mustnotmatchurl != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchurl.matcher(urls).matches()) { - String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustNotMatchPattern = " + mustnotmatchurl; + final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on url; indexUrlMustNotMatchPattern = " + mustnotmatchurl; if (this.log.isInfo()) this.log.info(info); // create a new errorURL DB entry this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1); @@ -3192,13 +3195,13 @@ public final class Switchboard extends serverSwitch { // check canonical if (profile.noindexWhenCanonicalUnequalURL()) { - AnchorURL canonical = document.getCanonical(); - DigestURL source = document.dc_source(); + final AnchorURL canonical = document.getCanonical(); + final DigestURL source = document.dc_source(); if (canonical != null && source != null) { - String canonical_norm = canonical.toNormalform(true); - String source_norm = source.toNormalform(true); + final String canonical_norm = canonical.toNormalform(true); + final String source_norm = source.toNormalform(true); if (!canonical_norm.equals(source_norm)) { - String info = "Not Condensed Resource '" + urls + "': denied, canonical != source; canonical = " +canonical_norm + "; source = " + source_norm; + final String info = "Not Condensed Resource '" + urls + "': denied, canonical != source; canonical = " +canonical_norm + "; source = " + source_norm; if (this.log.isInfo()) this.log.info(info); // create a new errorURL DB entry this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1); @@ -3216,9 +3219,9 @@ public final class Switchboard extends serverSwitch { } // check content pattern must-match - Pattern mustmatchcontent = profile.indexContentMustMatchPattern(); + final Pattern mustmatchcontent = profile.indexContentMustMatchPattern(); if (mustmatchcontent != CrawlProfile.MATCH_ALL_PATTERN && !mustmatchcontent.matcher(document.getTextString()).matches()) { - String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + mustmatchcontent.pattern() ; + final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustMatchPattern = " + mustmatchcontent.pattern() ; if (this.log.isInfo()) this.log.info(info); // create a new errorURL DB entry this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1); @@ -3226,9 +3229,9 @@ public final class Switchboard extends serverSwitch { } // check content pattern must-not-match - Pattern mustnotmatchcontent = profile.indexContentMustNotMatchPattern(); + final Pattern mustnotmatchcontent = profile.indexContentMustNotMatchPattern(); if (mustnotmatchcontent != CrawlProfile.MATCH_NEVER_PATTERN && mustnotmatchcontent.matcher(document.getTextString()).matches()) { - String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustNotMatchPattern = " + mustnotmatchcontent.pattern(); + final String info = "Not Condensed Resource '" + urls + "': indexing prevented by regular expression on content; indexContentMustNotMatchPattern = " + mustnotmatchcontent.pattern(); if (this.log.isInfo()) this.log.info(info); // create a new errorURL DB entry this.crawlQueues.errorURL.push(in.queueEntry.url(), in.queueEntry.depth(), profile, FailCategory.FINAL_PROCESS_CONTEXT, info, -1); From c2ad1950e88842a6211585379f5dd3a3ce05f280 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Fri, 10 May 2024 15:41:20 +0200 Subject: [PATCH 3/4] updated jetty to 9.4.54.v20240208 --- ivy.xml | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/ivy.xml b/ivy.xml index bfa4b96ce..952e47baf 100644 --- a/ivy.xml +++ b/ivy.xml @@ -68,22 +68,22 @@ - - - - - - - - - + + + + + + + + + - - - - - + + + + + From 59c0cb0f30b88a0c2a6cf84f07fba501fd835048 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Mon, 13 May 2024 02:10:24 +0200 Subject: [PATCH 4/4] fixed aarch64 dockerfile --- docker/Dockerfile.aarch64 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/Dockerfile.aarch64 b/docker/Dockerfile.aarch64 index 0f536c9a1..b95cf9c84 100644 --- a/docker/Dockerfile.aarch64 +++ b/docker/Dockerfile.aarch64 @@ -5,7 +5,7 @@ FROM arm64v8/openjdk:17-buster # Install needed packages not in base image # (curl for sh scripts in /bin, and wkhtmltopdf,imagemagick,xvfb and ghostscript to enable PDF and image snapshot generation) -RUN apt-get update && apt-get install -yq curl wkhtmltopdf imagemagick xvfb ghostscript && \ +RUN apt-get update && apt-get install -yq curl wkhtmltopdf imagemagick xvfb ghostscript ca-certificates-java && \ rm -rf /var/lib/apt/lists/* # trace java version