From a4498e17c0738c6ec65a38e31a5372741eb7fc8c Mon Sep 17 00:00:00 2001 From: reger Date: Sat, 8 Apr 2017 22:54:57 +0200 Subject: [PATCH 01/13] fix edit current user form to required post mehtod introduced with https://github.com/yacy/yacy_search_server/commit/cde237b68763c542da20038e5f62bea341ae1d37 --- htroot/ConfigAccounts_p.html | 4 ++-- htroot/ConfigAccounts_p.java | 25 +++++++++++++++++++++---- 2 files changed, 23 insertions(+), 6 deletions(-) diff --git a/htroot/ConfigAccounts_p.html b/htroot/ConfigAccounts_p.html index 40bd836aa..450e75f78 100644 --- a/htroot/ConfigAccounts_p.html +++ b/htroot/ConfigAccounts_p.html @@ -75,7 +75,7 @@
User Accounts -
+
Select user
@@ -128,7 +128,7 @@
-
+ #%env/templates/footer.template%# diff --git a/htroot/ConfigAccounts_p.java b/htroot/ConfigAccounts_p.java index 7ab11efc9..61bba4489 100644 --- a/htroot/ConfigAccounts_p.java +++ b/htroot/ConfigAccounts_p.java @@ -156,8 +156,8 @@ public class ConfigAccounts_p { //user != current_user //user=from userlist //current_user = edited user - } else if (post.containsKey("user") && !"newuser".equals(post.get("user"))){ - TransactionManager.checkPostTransaction(header, post); + } else if (post.containsKey("user") && !"newuser".equals(post.get("user"))) { + TransactionManager.checkPostTransaction(header, post); if (post.containsKey("change_user")) { //defaults for newuser are set above entry = sb.userDB.getEntry(post.get("user")); @@ -183,7 +183,7 @@ public class ConfigAccounts_p { sb.userDB.removeEntry(post.get("user")); } } else if (post.containsKey("change")) { //New User / edit User - TransactionManager.checkPostTransaction(header, post); + TransactionManager.checkPostTransaction(header, post); prop.put("text", "0"); prop.put("error", "0"); @@ -270,7 +270,24 @@ public class ConfigAccounts_p { prop.putHTML("text_username", username); prop.put("text", "2"); }//edit user + prop.putHTML("username", username); + if (entry != null) { + //TODO: set username read-only in html + prop.putHTML("current_user", entry.getUserName()); + prop.putHTML("username", entry.getUserName()); + prop.putHTML("firstname", entry.getFirstName()); + prop.putHTML("lastname", entry.getLastName()); + prop.putHTML("address", entry.getAddress()); + prop.put("timelimit", entry.getTimeLimit()); + prop.put("timeused", entry.getTimeUsed()); + int count = 0; + for (final AccessRight right : rights) { + prop.put("rights_" + count + "_set", entry.hasRight(right) ? 
"1" : "0"); + count++; + } + prop.put("rights", count); + } } //Generate Userlist @@ -289,4 +306,4 @@ public class ConfigAccounts_p { // return rewrite properties return prop; } -} + } From a39c00a93f375894436a7d88f9ef48f1c06a3623 Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 9 Apr 2017 02:09:32 +0200 Subject: [PATCH 02/13] add servlet to list user in UserDB and made user editor available in separate servlet for a quick and easy overview of configured user and selection for edit. --- htroot/ConfigAccountList_p.html | 29 ++++++ htroot/ConfigAccountList_p.java | 103 +++++++++++++++++++ htroot/ConfigAccounts_p.html | 2 +- htroot/ConfigAccounts_p.java | 4 +- htroot/ConfigUser_p.html | 67 +++++++++++++ htroot/ConfigUser_p.java | 172 ++++++++++++++++++++++++++++++++ 6 files changed, 374 insertions(+), 3 deletions(-) create mode 100644 htroot/ConfigAccountList_p.html create mode 100644 htroot/ConfigAccountList_p.java create mode 100644 htroot/ConfigUser_p.html create mode 100644 htroot/ConfigUser_p.java diff --git a/htroot/ConfigAccountList_p.html b/htroot/ConfigAccountList_p.html new file mode 100644 index 000000000..036a56502 --- /dev/null +++ b/htroot/ConfigAccountList_p.html @@ -0,0 +1,29 @@ + + + + YaCy '#[clientname]#': User Accounts + #%env/templates/metas.template%# + + + + #%env/templates/header.template%# + #%env/templates/submenuUseCaseAccount.template%# +

User List

+ +
User Accounts + + + + + #{userlist}# + + + + #{/userlist}# +
UserFirst nameLast nameAddressLast AccessRightsTimeTraffic
#[username]##[firstname]##[lastname]##[address]##[lastaccess]##[rights]##[time]##[traffic]#
+ +
+ +#%env/templates/footer.template%# + + diff --git a/htroot/ConfigAccountList_p.java b/htroot/ConfigAccountList_p.java new file mode 100644 index 000000000..e6ee44f63 --- /dev/null +++ b/htroot/ConfigAccountList_p.java @@ -0,0 +1,103 @@ +// ConfigAccountList_p.java +// ------------------------- +// (c) 2017 by reger24; https://github.com/reger24 +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import java.util.Date; +import java.util.Iterator; +import net.yacy.cora.date.GenericFormatter; +import net.yacy.cora.protocol.RequestHeader; +import net.yacy.data.UserDB; +import net.yacy.data.UserDB.AccessRight; +import net.yacy.search.Switchboard; +import net.yacy.server.serverObjects; +import net.yacy.server.serverSwitch; + +public class ConfigAccountList_p { + + public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) { + + final serverObjects prop = new serverObjects(); + final Switchboard sb = (Switchboard) env; + UserDB.Entry entry; + + if (sb.userDB == null) { + prop.put("userlist", 0); + return prop; + } + + //Generate Userlist + final Iterator it = sb.userDB.iterator(true); + int numUsers = 0; + while (it.hasNext()) { + entry = it.next(); + if 
(entry == null) { + continue; + } + prop.putHTML("userlist_" + numUsers + "_username", entry.getUserName()); + prop.putHTML("userlist_" + numUsers + "_lastname", entry.getLastName()); + prop.putHTML("userlist_" + numUsers + "_firstname", entry.getFirstName()); + prop.putHTML("userlist_" + numUsers + "_address", entry.getAddress()); + if (entry.getLastAccess() != null) { + prop.put("userlist_" + numUsers + "_lastaccess", GenericFormatter.FORMAT_SIMPLE.format(new Date(entry.getLastAccess()))); + } else { + prop.put("userlist_" + numUsers + "_lastaccess", "never"); + } + + final AccessRight[] rights = AccessRight.values(); + String rightStr = ""; + for (final AccessRight right : rights) { + if (entry.hasRight(right)) { + if (rightStr.isEmpty()) { + rightStr = right.getFriendlyName(); + } else { + rightStr += ", " + right.getFriendlyName(); + } + } + } + prop.putHTML("userlist_" + numUsers + "_rights", rightStr); + + long percent; + if (entry.getTrafficLimit() != null) { + long limit = entry.getTrafficLimit(); + long used = entry.getTrafficSize(); + percent = used * 100 / limit; + prop.put("userlist_" + numUsers + "_time", percent); + } else { + prop.put("userlist_" + numUsers + "_time", ""); + } + + percent = -1; + if (entry.getTimeLimit() > 0) { + long limit = entry.getTimeLimit(); + long used = entry.getTimeUsed(); + percent = used * 100 / limit; + prop.put("userlist_" + numUsers + "_traffic", percent); + } else { + prop.put("userlist_" + numUsers + "_traffic", ""); + } + + + numUsers++; + } + prop.put("userlist", numUsers); + + // return rewrite properties + return prop; + } +} diff --git a/htroot/ConfigAccounts_p.html b/htroot/ConfigAccounts_p.html index 450e75f78..4a95fc370 100644 --- a/htroot/ConfigAccounts_p.html +++ b/htroot/ConfigAccounts_p.html @@ -84,7 +84,7 @@ + or goto user account list
 
diff --git a/htroot/ConfigAccounts_p.java b/htroot/ConfigAccounts_p.java index 61bba4489..2b5a39f88 100644 --- a/htroot/ConfigAccounts_p.java +++ b/htroot/ConfigAccounts_p.java @@ -55,7 +55,7 @@ public class ConfigAccounts_p { /* Acquire a transaction token for the next POST form submission */ prop.put(TransactionManager.TRANSACTION_TOKEN_PARAM, TransactionManager.getTransactionToken(header)); - final Switchboard sb = Switchboard.getSwitchboard(); + final Switchboard sb = (Switchboard) env; UserDB.Entry entry = null; // admin password @@ -306,4 +306,4 @@ public class ConfigAccounts_p { // return rewrite properties return prop; } - } +} diff --git a/htroot/ConfigUser_p.html b/htroot/ConfigUser_p.html new file mode 100644 index 000000000..e284f984f --- /dev/null +++ b/htroot/ConfigUser_p.html @@ -0,0 +1,67 @@ + + + + YaCy '#[clientname]#': User Editor + #%env/templates/metas.template%# + + + #%env/templates/header.template%# + #%env/templates/submenuUseCaseAccount.template%# +

User Account Editor

+ + + #(text)# + :: +

User created: #[username]#

+ :: +

User changed: #[username]#

+ #(/text)# + #(error)# + :: +

Generic error.

+ :: +

Passwords do not match.

+ :: +

Username too short. Username must be >= 4 Characters.

+ :: +

Username already used (not allowed).

+ #(/error)# + +
+
Edit current user: #[username]# + + +
+
:
+
+
:
+
+
:
+
+
:
+
+
:
+
+
:
+
+
Rights:
+
+ #{rights}# +
+ #{/rights}# +
+
:
+
+
:
+
+
 
+
+ +
+
+
+
+ +#%env/templates/footer.template%# + + diff --git a/htroot/ConfigUser_p.java b/htroot/ConfigUser_p.java new file mode 100644 index 000000000..4572a1ae4 --- /dev/null +++ b/htroot/ConfigUser_p.java @@ -0,0 +1,172 @@ +// ConfigUser_p.java +// ----------------------- +// (c) 2017 by reger24; https://github.com/reger24 +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. +// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + +import java.util.EnumMap; +import java.util.Map; +import net.yacy.cora.order.Digest; +import net.yacy.cora.protocol.RequestHeader; +import net.yacy.cora.util.ConcurrentLog; +import net.yacy.data.UserDB; +import net.yacy.data.UserDB.AccessRight; +import net.yacy.search.Switchboard; +import net.yacy.search.SwitchboardConstants; +import net.yacy.server.serverObjects; +import net.yacy.server.serverSwitch; + +public class ConfigUser_p { + + public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, final serverSwitch env) { + + final serverObjects prop = new serverObjects(); + final Switchboard sb = (Switchboard) env; + + if (post != null && post.containsKey("cancel")) { + prop.put(serverObjects.ACTION_LOCATION, "ConfigAccountList_p.html"); + return prop; + } + + //default values + prop.put("current_user", 
"newuser"); + prop.put("username", ""); + prop.put("firstname", ""); + prop.put("lastname", ""); + prop.put("address", ""); + prop.put("timelimit", ""); + prop.put("timeused", ""); + + final AccessRight[] rights = AccessRight.values(); + int c = 0; + for (final AccessRight right : rights) { + prop.put("rights_" + c + "_name", right.toString()); + prop.put("rights_" + c + "_friendlyName", right.getFriendlyName()); + prop.put("rights_" + c + "_set", "0"); + c++; + } + prop.put("rights", c); + + prop.put("users", "0"); + + if (sb.userDB == null) { + return prop; + } + + if (post == null) { + //do nothing + } else if (post.containsKey("user") && !"newuser".equals(post.get("user"))) { + + UserDB.Entry entry = sb.userDB.getEntry(post.get("user")); + if (entry != null) { + //TODO: set username read-only in html + prop.putHTML("current_user", entry.getUserName()); + prop.putHTML("username", entry.getUserName()); + prop.putHTML("firstname", entry.getFirstName()); + prop.putHTML("lastname", entry.getLastName()); + prop.putHTML("address", entry.getAddress()); + prop.put("timelimit", entry.getTimeLimit()); + prop.put("timeused", entry.getTimeUsed()); + int count = 0; + for (final AccessRight right : rights) { + prop.put("rights_" + count + "_set", entry.hasRight(right) ? 
"1" : "0"); + count++; + } + prop.put("rights", count); + } + } else if (post.containsKey("change")) { // edit User + prop.put("text", "0"); + prop.put("error", "0"); + + final String username = post.get("username"); + final String pw1 = post.get("password"); + final String pw2 = post.get("password2"); + + if (pw1 == null || !pw1.equals(pw2)) { + prop.put("error", "2"); //PW does not match + return prop; + } + // do not allow same username as staticadmin + if (username.equalsIgnoreCase(sb.getConfig(SwitchboardConstants.ADMIN_ACCOUNT_USER_NAME, "admin"))) { + prop.put("error", "4"); + return prop; + } + final String firstName = post.get("firstname"); + final String lastName = post.get("lastname"); + final String address = post.get("address"); + final String timeLimit = post.get("timelimit"); + final String timeUsed = post.get("timeused"); + final Map rightsSet = new EnumMap(AccessRight.class); + + for (final AccessRight right : rights) { + rightsSet.put(right, post.containsKey(right.toString()) && "on".equals(post.get(right.toString())) ? 
"true" : "false"); + } + + UserDB.Entry entry = sb.userDB.getEntry(username); + + if (entry != null) { + try { + if (!"".equals(pw1)) { + // with prefix of encoding method (supported MD5: ) + entry.setProperty(UserDB.Entry.MD5ENCODED_USERPWD_STRING, "MD5:" + Digest.encodeMD5Hex(username + ":" + sb.getConfig(SwitchboardConstants.ADMIN_REALM, "YaCy") + ":" + pw1)); + } + + entry.setProperty(UserDB.Entry.USER_FIRSTNAME, firstName); + entry.setProperty(UserDB.Entry.USER_LASTNAME, lastName); + entry.setProperty(UserDB.Entry.USER_ADDRESS, address); + entry.setProperty(UserDB.Entry.TIME_LIMIT, timeLimit); + entry.setProperty(UserDB.Entry.TIME_USED, timeUsed); + + for (final AccessRight right : rights) { + entry.setProperty(right.toString(), rightsSet.get(right)); + } + + //TODO: set username read-only in html + prop.putHTML("current_user", entry.getUserName()); + prop.putHTML("username", entry.getUserName()); + prop.putHTML("firstname", entry.getFirstName()); + prop.putHTML("lastname", entry.getLastName()); + prop.putHTML("address", entry.getAddress()); + prop.put("timelimit", entry.getTimeLimit()); + prop.put("timeused", entry.getTimeUsed()); + int count = 0; + for (final AccessRight right : rights) { + prop.put("rights_" + count + "_set", entry.hasRight(right) ? 
"1" : "0"); + count++; + } + prop.put("rights", count); + + } catch (final Exception e) { + ConcurrentLog.logException(e); + } + + } else { + prop.put("error", "1"); + } + prop.putHTML("text_username", username); + prop.put("text", "2"); + + prop.putHTML("username", username); + } else if (post.containsKey("delete")) { + sb.userDB.removeEntry(post.get("username")); + prop.put(serverObjects.ACTION_LOCATION, "ConfigAccountList_p.html"); // jump back to user list + } + + // return rewrite properties + return prop; + } +} From 05a1b14b4af9059f1bc893b02ef62b1d994ff47a Mon Sep 17 00:00:00 2001 From: reger Date: Sun, 9 Apr 2017 21:42:05 +0200 Subject: [PATCH 03/13] add missing text from ConfigRobotsTxt_p to master.lng and link to Translation Editor to Translation News page. --- htroot/TransNews_p.html | 3 ++- locales/master.lng.xlf | 6 ++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/htroot/TransNews_p.html b/htroot/TransNews_p.html index 7e69deedb..840764502 100644 --- a/htroot/TransNews_p.html +++ b/htroot/TransNews_p.html @@ -14,7 +14,8 @@ You can share your local addition to translations and distribute it to other peers. The remote peer can vote on your translation and add it to the own local translation.
(#[transsize]# entries available)   -   You can check your outgoing messages here +   You can check your outgoing messages here. + To edit or add local translations you can use Translator_p.html.

#(errmsg)#::

Please activate a different language here

#(/errmsg)# diff --git a/locales/master.lng.xlf b/locales/master.lng.xlf index 6ca2b5827..64f3750dd 100644 --- a/locales/master.lng.xlf +++ b/locales/master.lng.xlf @@ -1936,6 +1936,9 @@ Here you can set up a robots.txt for all webcrawlers that try to access the webinterface of your peer. + + is a voluntary agreement most search-engines (including YaCy) follow. + It disallows crawlers to access webpages or even entire domains. @@ -8973,6 +8976,9 @@ >here< + + To edit or add local translations you can use + File: From 7b80189bda73b7118d996b4ff07681ac06cf5f60 Mon Sep 17 00:00:00 2001 From: reger Date: Mon, 10 Apr 2017 22:42:06 +0200 Subject: [PATCH 04/13] Activate hosts navigator plugin. This includes rwi results in the navigator count. This might be tangential related to http://mantis.tokeek.de/view.php?id=736 as the example includes a local index search, while rwi results are not counted. --- htroot/yacysearchtrailer.html | 8 ---- htroot/yacysearchtrailer.java | 42 ------------------- htroot/yacysearchtrailer.json | 14 +------ htroot/yacysearchtrailer.xml | 7 ---- .../search/navigator/NavigatorPlugins.java | 6 +-- .../net/yacy/search/query/QueryModifier.java | 4 +- source/net/yacy/search/query/SearchEvent.java | 33 +++++++-------- 7 files changed, 22 insertions(+), 92 deletions(-) diff --git a/htroot/yacysearchtrailer.html b/htroot/yacysearchtrailer.html index 0057f9b7d..d72c11f2e 100644 --- a/htroot/yacysearchtrailer.html +++ b/htroot/yacysearchtrailer.html @@ -156,14 +156,6 @@ function toggleVisibility(name, count) { } } -#(nav-domains)#:: - - -#(/nav-domains)# #(nav-languages)#:: - - - -
:: +

#(status)#::

+ #(/status)#

Import Process
Thread:
#[thread]#
diff --git a/htroot/IndexImportMediawiki_p.java b/htroot/IndexImportMediawiki_p.java index 095aaf106..574892d46 100644 --- a/htroot/IndexImportMediawiki_p.java +++ b/htroot/IndexImportMediawiki_p.java @@ -23,8 +23,11 @@ // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA import java.io.File; +import java.net.MalformedURLException; +import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.protocol.RequestHeader; +import net.yacy.data.TransactionManager; import net.yacy.document.importer.MediawikiImporter; import net.yacy.search.Switchboard; import net.yacy.server.serverObjects; @@ -54,6 +57,11 @@ public class IndexImportMediawiki_p { if (MediawikiImporter.job != null && MediawikiImporter.job.isAlive()) { // one import is running, no option to insert anything prop.put("import", 1); + final String jobErrorMessage = MediawikiImporter.job.status(); + if( jobErrorMessage != null && !jobErrorMessage.isEmpty()) { + prop.put("import_status", 1); + prop.put("import_status_message", jobErrorMessage); + } prop.put("import_thread", "running"); prop.put("import_dump", MediawikiImporter.job.source()); prop.put("import_count", MediawikiImporter.job.count()); @@ -64,33 +72,63 @@ public class IndexImportMediawiki_p { prop.put("import_remainingMinutes", (MediawikiImporter.job.remainingTime() / 60) % 60); } else { prop.put("import", 0); + if(MediawikiImporter.job != null) { + /* Report eventual fail report from the last terminated import (for example an HTTP 404 status) + * that else could be missed by the user because of page refresh */ + final String jobErrorMessage = MediawikiImporter.job.status(); + if( jobErrorMessage != null && !jobErrorMessage.isEmpty()) { + prop.put("import_prevStatus", 1); + prop.put("import_prevStatus_message", jobErrorMessage); + } + } if (post == null) { prop.put("import_status", 0); + + /* Acquire a transaction token for the next POST form submission */ + final String token = 
TransactionManager.getTransactionToken(header); + prop.put(TransactionManager.TRANSACTION_TOKEN_PARAM, token); + prop.put("import_" + TransactionManager.TRANSACTION_TOKEN_PARAM, token); + } else { if (post.containsKey("file")) { + /* Check the transaction is valid */ + TransactionManager.checkPostTransaction(header, post); + String file = post.get("file"); - if (file.startsWith("file://")) file = file.substring(7); - if (file.startsWith("http")) { - prop.put("import_status", 1); - } else { - final File sourcefile = new File(file); - if (!sourcefile.exists()) { - prop.put("import_status", 2); - prop.put("import_status_sourceFile", sourcefile.getAbsolutePath()); - } else if(!sourcefile.canRead()) { - prop.put("import_status", 3); - prop.put("import_status_sourceFile", sourcefile.getAbsolutePath()); - } else if(sourcefile.isDirectory()) { - prop.put("import_status", 4); - prop.put("import_status_sourceFile", sourcefile.getAbsolutePath()); - } else { - MediawikiImporter.job = new MediawikiImporter(sourcefile, sb.surrogatesInPath); - MediawikiImporter.job.start(); - prop.put("import_dump", MediawikiImporter.job.source()); - prop.put("import_thread", "started"); - prop.put("import", 1); - } - } + MultiProtocolURL sourceURL = null; + int status = 0; + String sourceFilePath = ""; + try { + sourceURL = new MultiProtocolURL(file); + if(sourceURL.isFile()) { + final File sourcefile = sourceURL.getFSFile(); + sourceFilePath = sourcefile.getAbsolutePath(); + if (!sourcefile.exists()) { + status = 2; + } else if (!sourcefile.canRead()) { + status = 3; + } else if (sourcefile.isDirectory()) { + status = 4; + } + } + } catch (MalformedURLException e) { + status = 1; + } + if (status == 0) { + MediawikiImporter.job = new MediawikiImporter(sourceURL, sb.surrogatesInPath); + MediawikiImporter.job.start(); + prop.put("import_dump", MediawikiImporter.job.source()); + prop.put("import_thread", "started"); + prop.put("import", 1); + } else { + prop.put("import_status", status); + 
prop.put("import_status_sourceFile", sourceFilePath); + + /* Acquire a transaction token for the next POST form submission */ + final String token = TransactionManager.getTransactionToken(header); + prop.put(TransactionManager.TRANSACTION_TOKEN_PARAM, token); + prop.put("import_" + TransactionManager.TRANSACTION_TOKEN_PARAM, token); + } prop.put("import_count", 0); prop.put("import_speed", 0); prop.put("import_runningHours", 0); diff --git a/source/net/yacy/cora/document/id/MultiProtocolURL.java b/source/net/yacy/cora/document/id/MultiProtocolURL.java index 098308a59..7a42826a2 100644 --- a/source/net/yacy/cora/document/id/MultiProtocolURL.java +++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java @@ -48,6 +48,8 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.apache.http.HttpStatus; + import jcifs.smb.SmbException; import jcifs.smb.SmbFile; import jcifs.smb.SmbFileInputStream; @@ -62,6 +64,7 @@ import net.yacy.cora.protocol.ftp.FTPClient; import net.yacy.cora.protocol.http.HTTPClient; import net.yacy.cora.util.CommonPattern; import net.yacy.cora.util.ConcurrentLog; +import net.yacy.cora.util.HTTPInputStream; import net.yacy.crawler.retrieval.Response; /** @@ -2290,7 +2293,14 @@ public class MultiProtocolURL implements Serializable, ComparablePlease don't forget to release resources by closing the returned stream. 
+ * @param agent user agent identifier to use when the protocul is HTTP + * @return an open input stream + * @throws IOException when the stream can not be opened + */ + public InputStream getInputStream(final ClientIdentification.Agent agent) throws IOException { if (isFile()) return new BufferedInputStream(new FileInputStream(getFSFile())); if (isSMB()) return new BufferedInputStream(new SmbFileInputStream(getSmbFile())); if (isFTP()) { @@ -2303,7 +2313,12 @@ public class MultiProtocolURL implements Serializable, Comparable out = new ArrayBlockingQueue(threads * 10); final wikiparserrecord poison = newRecord(); + BufferedReader reader = null; try { - String targetstub = this.sourcefile.getName(); + String targetstub = this.sourcefile.getFileName(); int p = targetstub.lastIndexOf("\\."); if (p > 0) targetstub = targetstub.substring(0, p); - InputStream is = new BufferedInputStream(new FileInputStream(this.sourcefile), 1024 * 1024); - if (this.sourcefile.getName().endsWith(".bz2")) { + InputStream is = new BufferedInputStream(this.sourcefile.getInputStream(ClientIdentification.yacyInternetCrawlerAgent), 1024 * 1024); + if (this.sourcefile.getFileName().endsWith(".bz2")) { is = new BZip2CompressorInputStream(is); - } else if (this.sourcefile.getName().endsWith(".gz")) { + } else if (this.sourcefile.getFileName().endsWith(".gz")) { is = new GZIPInputStream(is); } - final BufferedReader r = new BufferedReader(new java.io.InputStreamReader(is, StandardCharsets.UTF_8), 4 * 1024 * 1024); + reader = new BufferedReader(new java.io.InputStreamReader(is, StandardCharsets.UTF_8), 4 * 1024 * 1024); String t; StringBuilder sb = new StringBuilder(); boolean page = false, text = false; @@ -181,7 +190,7 @@ public class MediawikiImporter extends Thread implements Importer { wikiparserrecord record; int q; - while ((t = r.readLine()) != null) { + while ((t = reader.readLine()) != null) { if ((p = t.indexOf("",0)) >= 0 && (q = t.indexOf("", p)) > 0) { //urlStub = "http://" + lang + 
".wikipedia.org/wiki/"; this.urlStub = t.substring(p + 6, q); @@ -256,7 +265,6 @@ public class MediawikiImporter extends Thread implements Importer { sb.append('\n'); } } - r.close(); try { for (int i = 0; i < threads; i++) { @@ -265,23 +273,24 @@ public class MediawikiImporter extends Thread implements Importer { for (int i = 0; i < threads; i++) { consumerResults[i].get(10000, TimeUnit.MILLISECONDS); } - } catch (final InterruptedException e) { - ConcurrentLog.logException(e); - } catch (final ExecutionException e) { - ConcurrentLog.logException(e); - } catch (final TimeoutException e) { - ConcurrentLog.logException(e); } catch (final Exception e) { + this.errorMessage = e.getMessage(); ConcurrentLog.logException(e); } finally { out.put(poison); // output thread condition (for file.close) writerResult.get(10000, TimeUnit.MILLISECONDS); } - } catch (final IOException e) { - ConcurrentLog.logException(e); } catch (final Exception e) { + this.errorMessage = e.getMessage(); ConcurrentLog.logException(e); } finally { + if(reader != null) { + try { + reader.close(); + } catch (IOException e) { + ConcurrentLog.warn("WIKITRANSLATION", "Could not close dump reader : " + e.getMessage()); + } + } try { out.put(poison); // out keeps output file open until poisened, to close file if exception happend in this block } catch (InterruptedException ex) { } @@ -767,7 +776,7 @@ public class MediawikiImporter extends Thread implements Importer { System.out.println(" -index "); System.out.println(" -read "); System.out.println(" -find <wikipedia-dump>"); - System.out.println(" -convert <wikipedia-dump-xml.bz2> <convert-target-dir> <url-stub>"); + System.out.println(" -convert <wikipedia-dump-xml.bz2> <convert-target-dir>"); ConcurrentLog.shutdown(); return; } @@ -779,17 +788,22 @@ public class MediawikiImporter extends Thread implements Importer { // DATA/HTCACHE/dewiki-20090311-pages-articles.xml.bz2 // DATA/SURROGATES/in/ http://de.wikipedia.org/wiki/ - if (s[0].equals("-convert") 
&& s.length > 2) { - final File sourcefile = new File(s[1]); + if (s[0].equals("-convert")) { + if(s.length < 3) { + System.out.println("usage:"); + System.out.println(" -convert <wikipedia-dump-xml.bz2> <convert-target-dir>"); + ConcurrentLog.shutdown(); + return; + } final File targetdir = new File(s[2]); - // String urlStub = s[3]; // i.e. http://de.wikipedia.org/wiki/ - // String language = urlStub.substring(7,9); try { - final MediawikiImporter mi = new MediawikiImporter(sourcefile, targetdir); + final MediawikiImporter mi = new MediawikiImporter(new MultiProtocolURL(s[1]), targetdir); mi.start(); mi.join(); } catch (final InterruptedException e) { ConcurrentLog.logException(e); + } catch (MalformedURLException e) { + ConcurrentLog.logException(e); } } @@ -821,6 +835,11 @@ public class MediawikiImporter extends Thread implements Importer { } } finally { + try { + HTTPClient.closeConnectionManager(); + } catch (InterruptedException e) { + e.printStackTrace(); + } ConcurrentLog.shutdown(); } } diff --git a/source/net/yacy/document/parser/htmlParser.java b/source/net/yacy/document/parser/htmlParser.java index 3088815a1..77798ed5c 100644 --- a/source/net/yacy/document/parser/htmlParser.java +++ b/source/net/yacy/document/parser/htmlParser.java @@ -380,8 +380,20 @@ public class htmlParser extends AbstractParser implements Parser { locationSnapshot = new DigestURL(location.toNormalform(true) + "?_escaped_fragment_="); } Charset[] detectedcharsetcontainer = new Charset[]{null}; - ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, locationSnapshot.getInputStream(ClientIdentification.yacyInternetCrawlerAgent, null, null), maxLinks); - documentSnapshot = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraperSnapshot); + InputStream snapshotStream = null; + try { + snapshotStream = locationSnapshot.getInputStream(ClientIdentification.yacyInternetCrawlerAgent); + 
ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, snapshotStream, maxLinks); + documentSnapshot = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraperSnapshot); + } finally { + if(snapshotStream != null) { + try { + snapshotStream.close(); + } catch(IOException e) { + AbstractParser.log.warn("Could not close snapshot stream : " + e.getMessage()); + } + } + } AbstractParser.log.info("parse snapshot "+locationSnapshot.toString() + " additional to " + location.toString()); } catch (IOException | Failure ex) { } return documentSnapshot; diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java index 1909178a9..e8f04318f 100644 --- a/source/net/yacy/repository/LoaderDispatcher.java +++ b/source/net/yacy/repository/LoaderDispatcher.java @@ -394,7 +394,7 @@ public final class LoaderDispatcher { inStream = this.httpLoader.openInputStream(request, crawlProfile, 1, maxFileSize, blacklistType, agent); } else if (protocol.equals("ftp") || protocol.equals("smb") || protocol.equals("file")) { // may also open directly stream with ftp loader - inStream = url.getInputStream(agent, null, null); + inStream = url.getInputStream(agent); } else { throw new IOException("Unsupported protocol '" + protocol + "' in url " + url); } diff --git a/source/net/yacy/search/index/DocumentIndex.java b/source/net/yacy/search/index/DocumentIndex.java index 7512393a3..2cba366bd 100644 --- a/source/net/yacy/search/index/DocumentIndex.java +++ b/source/net/yacy/search/index/DocumentIndex.java @@ -28,6 +28,7 @@ package net.yacy.search.index; import java.io.File; import java.io.IOException; +import java.io.InputStream; import java.net.MalformedURLException; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; @@ -158,10 +159,20 @@ public class DocumentIndex extends Segment { } catch (final Exception e ) { 
length = -1; } + InputStream sourceStream = null; try { - documents = TextParser.parseSource(url, null, null, new VocabularyScraper(), timezoneOffset, 0, length, url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent, null, null)); + sourceStream = url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent); + documents = TextParser.parseSource(url, null, null, new VocabularyScraper(), timezoneOffset, 0, length, sourceStream); } catch (final Exception e ) { throw new IOException("cannot parse " + url.toNormalform(false) + ": " + e.getMessage()); + } finally { + if(sourceStream != null) { + try { + sourceStream.close(); + } catch(IOException e) { + ConcurrentLog.warn("DocumentIndex", "Could not close source stream : " + e.getMessage()); + } + } } //Document document = Document.mergeDocuments(url, null, documents); final SolrInputDocument[] rows = new SolrInputDocument[documents.length]; From d3df8a46c426e64835f4f2f25914aeaa200368c5 Mon Sep 17 00:00:00 2001 From: reger <reger18@arcor.de> Date: Fri, 14 Apr 2017 21:14:26 +0200 Subject: [PATCH 12/13] fix unresolved_pattern on missing post parameter api/message.html --- htroot/yacy/message.java | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/htroot/yacy/message.java b/htroot/yacy/message.java index 9f46515d4..59a07e3ca 100644 --- a/htroot/yacy/message.java +++ b/htroot/yacy/message.java @@ -59,11 +59,15 @@ public final class message { } public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) { - if (post == null || env == null) { return null; } // return variable that accumulates replacements final Switchboard sb = (Switchboard) env; final serverObjects prop = new serverObjects(); + + prop.put("messagesize", "0"); + prop.put("attachmentsize", "0"); + prop.put("response", "-1"); // request rejected + if ((post == null) || (env == null)) return prop; if (!Protocol.authentifyRequest(post, env)) return 
prop; @@ -74,22 +78,17 @@ public final class message { final int messagesize = 10240; final int attachmentsize = 0; - prop.put("messagesize", "0"); - prop.put("attachmentsize", "0"); - final String youare = post.get("youare", ""); // seed hash of the target peer, needed for network stability // check if we are the right target and requester has correct information about this peer if ((sb.peers.mySeed() == null) || (!(sb.peers.mySeed().hash.equals(youare)))) { // this request has a wrong target - prop.put("response", "-1"); // request rejected return prop; } - if ((sb.isRobinsonMode()) && - (!((sb.isPublicRobinson()) || - (sb.isInMyCluster(header.getRemoteAddr()))))) { + if ((sb.isRobinsonMode()) + && (!((sb.isPublicRobinson()) + || (sb.isInMyCluster(header.getRemoteAddr()))))) { // if we are a robinson cluster, answer only if this client is known by our network definition - prop.put("response", "-1"); // request rejected return prop; } @@ -107,7 +106,7 @@ public final class message { // post: post message to message board final String otherSeedString = post.get("myseed", ""); if (otherSeedString.isEmpty()) { - prop.put("response", "-1"); // request rejected + // request rejected return prop; } //Date remoteTime = yacyCore.parseUniversalDate((String) post.get(yacySeed.MYTIME)); // read remote time @@ -115,20 +114,20 @@ public final class message { try { otherSeed = Seed.genRemoteSeed(otherSeedString, false, ias == null ? 
null : ias.getHostAddress()); } catch (final IOException e) { - prop.put("response", "-1"); // don't accept messages for bad seeds + // don't accept messages for bad seeds return prop; } String subject = crypt.simpleDecode(post.get("subject", "")); // message's subject String message = crypt.simpleDecode(post.get("message", "")); // message body if (subject == null || message == null) { - prop.put("response", "-1"); // don't accept empty messages + // don't accept empty messages return prop; } message = message.trim(); subject = subject.trim(); if (subject.isEmpty() || message.isEmpty()) { - prop.put("response", "-1"); // don't accept empty messages + // don't accept empty messages return prop; } @@ -156,7 +155,6 @@ public final class message { } } -// System.out.println("respond = " + prop.toString()); // return rewrite properties return prop; From bec34d35468bc952e71028a1630fef5026bf448a Mon Sep 17 00:00:00 2001 From: reger <reger18@arcor.de> Date: Sun, 16 Apr 2017 04:25:29 +0200 Subject: [PATCH 13/13] Add url input field as source for WarcImporter allowing to import warc from url without prior download. --- htroot/IndexImportWarc_p.html | 17 ++++--- htroot/IndexImportWarc_p.java | 51 ++++++++++++++----- .../yacy/document/importer/WarcImporter.java | 12 +++++ 3 files changed, 59 insertions(+), 21 deletions(-) diff --git a/htroot/IndexImportWarc_p.html b/htroot/IndexImportWarc_p.html index d6003bc9e..0d490eb9e 100644 --- a/htroot/IndexImportWarc_p.html +++ b/htroot/IndexImportWarc_p.html @@ -22,13 +22,16 @@ You can download warc archives for example here <a href="https://archive.org/search.php?query=subject%3A%22warcarchives%22&and[]=subject%3A%22warcarchives%22" target="_blank">Internet Archive</a>. 
</p> - <div class="input-group"> - <span style="display: inline-block"> - <input name="file" type="file" value="" size="75" /></span> - <div class="btn-group"> - <input name="submit" class="btn btn-primary" type="submit" value="Import Warc File" /> - </div> - </div> + <dl> + <dt class="TableCellDark"><label for="file">File:</label></dt> + <dd><input name="file" id="file" type="file" value="" size="75" /></dd> + <dt></dt> + <dd>or</dd> + <dt class="TableCellDark"><label for="url">Url:</label></dt> + <dd><input name="url" id="url" value="" size="75"/></dd> + <dt></dt> + <dd><input name="submit" class="btn btn-primary" type="submit" value="Import Warc File" /></dd> + </dl> </fieldset> </form> diff --git a/htroot/IndexImportWarc_p.java b/htroot/IndexImportWarc_p.java index f503fe98b..6a3127952 100644 --- a/htroot/IndexImportWarc_p.java +++ b/htroot/IndexImportWarc_p.java @@ -18,6 +18,10 @@ import java.io.File; import java.io.FileNotFoundException; +import java.io.IOException; +import java.net.MalformedURLException; +import net.yacy.cora.document.id.MultiProtocolURL; +import net.yacy.cora.protocol.ClientIdentification; import net.yacy.cora.protocol.RequestHeader; import net.yacy.document.importer.WarcImporter; @@ -45,23 +49,42 @@ public class IndexImportWarc_p { } else { prop.put("import", 0); if (post != null) { - if (post.containsKey("file")) { - String file = post.get("file"); - final File sourcefile = new File(file); - if (sourcefile.exists()) { - try { - WarcImporter wi = new WarcImporter(sourcefile); - wi.start(); - prop.put("import_thread", "started"); - } catch (FileNotFoundException ex) { - prop.put("import_thread", "Error: file not found [" + file + "]"); + if (post.containsKey("file") || post.containsKey("url")) { + String filename = post.get("file"); + if (filename != null && filename.length() > 0) { + final File sourcefile = new File(filename); + if (sourcefile.exists()) { + try { + WarcImporter wi = new WarcImporter(sourcefile); + wi.start(); + 
prop.put("import_thread", "started"); + } catch (FileNotFoundException ex) { + prop.put("import_thread", "Error: file not found [" + filename + "]"); + } + prop.put("import", 1); + prop.put("import_warcfile", filename); + } else { + prop.put("import_warcfile", ""); + prop.put("import_thread", "Error: file not found [" + filename + "]"); } - prop.put("import_warcfile", file); } else { - prop.put("import_warcfile", ""); - prop.put("import_thread", "Error: file not found [" + file + "]"); + String urlstr = post.get("url"); + if (urlstr != null && urlstr.length() > 0) { + try { + MultiProtocolURL url = new MultiProtocolURL(urlstr); + WarcImporter wi = new WarcImporter(url.getInputStream(ClientIdentification.yacyInternetCrawlerAgent), urlstr); + wi.start(); + prop.put("import_thread", "started"); + } catch (MalformedURLException ex) { + prop.put("import_thread", ex.getMessage()); + } catch (IOException ex) { + prop.put("import_thread", ex.getMessage()); + } + prop.put("import", 1); + prop.put("import_warcfile", urlstr); + } } - prop.put("import", 1); + prop.put("import_count", 0); prop.put("import_speed", 0); prop.put("import_runningHours", 0); diff --git a/source/net/yacy/document/importer/WarcImporter.java b/source/net/yacy/document/importer/WarcImporter.java index e921765ce..5ad4582b3 100644 --- a/source/net/yacy/document/importer/WarcImporter.java +++ b/source/net/yacy/document/importer/WarcImporter.java @@ -73,6 +73,18 @@ public class WarcImporter extends Thread implements Importer { sourceSize = -1; } + /** + * Init the WarcImporter with input stream with a informational filename or + * url als info for calls to the importer methode source() which returns + * the urlinfo. 
Otherwise this method is equivalent to WarcImporter(InputStream) + * @param f the input stream to read the warc archive from + * @param urlinfo an informational label, such as the url or the filename + */ + public WarcImporter (InputStream f, String urlinfo) { + this(f); + name = urlinfo; + } + public WarcImporter(File f) throws FileNotFoundException{ name = f.getName(); sourceSize = f.length();