From 6f4fe4b175a33042c0cb4c68d48899cc1fdc3e7e Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Fri, 8 May 2015 14:01:30 +0200 Subject: [PATCH] revert of 8a7c68e4c7f6a682e3ef656b423ce1ad76b42caa keeping surrogates after processing is essential for some users. If the space they are taking is too high, please set up an automatic deletion process (like a cronjob). --- .../document/importer/OAIPMHImporter.java | 38 +++++++++++++- source/net/yacy/search/Switchboard.java | 49 ++++++++++++++++--- .../net/yacy/search/SwitchboardConstants.java | 4 +- 3 files changed, 82 insertions(+), 9 deletions(-) diff --git a/source/net/yacy/document/importer/OAIPMHImporter.java b/source/net/yacy/document/importer/OAIPMHImporter.java index 173332b48..8c8885e5e 100644 --- a/source/net/yacy/document/importer/OAIPMHImporter.java +++ b/source/net/yacy/document/importer/OAIPMHImporter.java @@ -22,8 +22,13 @@ package net.yacy.document.importer; +import java.io.File; import java.io.IOException; import java.net.MalformedURLException; +import java.text.ParseException; +import java.util.Date; +import java.util.HashMap; +import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import net.yacy.cora.date.GenericFormatter; @@ -176,12 +181,43 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable getLoadedOAIServer(File surrogatesIn, File surrogatesOut) { + Map map = getLoadedOAIServer(surrogatesOut); + map.putAll((Map) getLoadedOAIServer(surrogatesIn).entrySet()); + return map; + } + + private static Map getLoadedOAIServer(File surrogates) { + HashMap map = new HashMap(); + //oaipmh_opus.bsz-bw.de_20091102113118728.xml + for (String s: surrogates.list()) { + if (s.startsWith(filenamePrefix) && s.endsWith(".xml") && s.charAt(s.length() - 22) == filenameSeparationChar) { + try { + Date fd = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(s.substring(s.length() - 21, s.length() - 4), 0).getTime(); + String hostID = s.substring(7, s.length() - 22); + Date md = map.get(hostID); + if (md == null || fd.after(md)) map.put(hostID, fd); + } catch (final ParseException e) { + ConcurrentLog.logException(e); + } + } + } + return map; + } + public static final char hostReplacementChar = '_'; public static final char filenameSeparationChar = '.'; public static final String filenamePrefix = "oaipmh"; /** - * compute a host id + * compute a host id that is also used in the getLoadedOAIServer method for the map key * @param source * @return a string that is a key for the given host */ diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 404cf574b..96d4c6864 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -37,14 +37,17 @@ package net.yacy.search; import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; import java.io.BufferedReader; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; +import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; +import java.io.OutputStream; import java.io.Reader; import java.net.MalformedURLException; import java.security.NoSuchAlgorithmException; @@ -74,6 +77,7 @@ import java.util.concurrent.atomic.AtomicInteger; import java.util.regex.Pattern; import java.util.regex.PatternSyntaxException; import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; import java.util.zip.ZipEntry; import java.util.zip.ZipInputStream; @@ -245,7 +249,7 @@ public final class Switchboard extends serverSwitch { public File networkRoot; public File queuesRoot; public File surrogatesInPath; - //public File surrogatesOutPath; + public File surrogatesOutPath; public Segment index; public LoaderDispatcher loader; public CrawlSwitchboard crawler; @@ -715,14 +719,23 @@ public final class Switchboard extends serverSwitch { SwitchboardConstants.SURROGATES_IN_PATH_DEFAULT); this.log.info("surrogates.in Path = " + this.surrogatesInPath.getAbsolutePath()); this.surrogatesInPath.mkdirs(); -/* this.surrogatesOutPath = + this.surrogatesOutPath = getDataPath( SwitchboardConstants.SURROGATES_OUT_PATH, SwitchboardConstants.SURROGATES_OUT_PATH_DEFAULT); this.log.info("surrogates.out Path = " + this.surrogatesOutPath.getAbsolutePath()); this.surrogatesOutPath.mkdirs(); -*/ - + + // copy opensearch heuristic config (if not exist) + final File osdConfig = new File(getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf"); + if (!osdConfig.exists()) { + final File osdDefaultConfig = new File("defaults/heuristicopensearch.conf"); + this.log.info("heuristic.opensearch list Path = " + osdDefaultConfig.getAbsolutePath()); + try { + Files.copy(osdDefaultConfig, osdConfig); + } catch (final IOException ex) { } + } + // create the release download directory this.releasePath = getDataPath(SwitchboardConstants.RELEASE_PATH, SwitchboardConstants.RELEASE_PATH_DEFAULT); @@ -1866,6 +1879,8 @@ public final class Switchboard extends serverSwitch { if ( !infile.exists() || !infile.canWrite() || !infile.canRead() ) { return false; } + final File outfile = new File(this.surrogatesOutPath, s); + //if (outfile.exists()) return false; boolean moved = false; if ( s.endsWith("xml.zip") ) { // open the zip file with all the xml files in it @@ -1889,7 +1904,7 @@ public final class Switchboard extends serverSwitch { } catch (final IOException e ) { ConcurrentLog.logException(e); } finally { - moved = infile.delete(); + moved = infile.renameTo(outfile); if (zis != null) try {zis.close();} catch (final IOException e) {} } return moved; @@ -1905,7 +1920,29 @@ public final class Switchboard extends serverSwitch { ConcurrentLog.logException(e); } finally { if (!shallTerminate()) { - moved = infile.delete(); + moved = infile.renameTo(outfile); + if ( moved ) { + // check if this file is already compressed, if not, compress now + if ( !outfile.getName().endsWith(".gz") ) { + final String gzname = outfile.getName() + ".gz"; + final File gzfile = new File(outfile.getParentFile(), gzname); + try { + final OutputStream os = + new BufferedOutputStream(new GZIPOutputStream(new FileOutputStream(gzfile))); + BufferedInputStream bis = new BufferedInputStream(new FileInputStream(outfile)); + FileUtils.copy(bis, os); + os.close(); + bis.close(); + if ( gzfile.exists() ) { + FileUtils.deletedelete(outfile); + } + } catch (final FileNotFoundException e ) { + ConcurrentLog.logException(e); + } catch (final IOException e ) { + ConcurrentLog.logException(e); + } + } + } } if (is != null) try {is.close();} catch (IOException e) {} } diff --git a/source/net/yacy/search/SwitchboardConstants.java b/source/net/yacy/search/SwitchboardConstants.java index 53779d0c4..928f1887c 100644 --- a/source/net/yacy/search/SwitchboardConstants.java +++ b/source/net/yacy/search/SwitchboardConstants.java @@ -407,8 +407,8 @@ public final class SwitchboardConstants { public static final String SURROGATES_IN_PATH = "surrogates.in"; public static final String SURROGATES_IN_PATH_DEFAULT = "DATA/SURROGATES/in"; - //public static final String SURROGATES_OUT_PATH = "surrogates.out"; - //public static final String SURROGATES_OUT_PATH_DEFAULT = "DATA/SURROGATES/out"; + public static final String SURROGATES_OUT_PATH = "surrogates.out"; + public static final String SURROGATES_OUT_PATH_DEFAULT = "DATA/SURROGATES/out"; public static final String DICTIONARY_SOURCE_PATH = "dictionaries"; public static final String DICTIONARY_SOURCE_PATH_DEFAULT = "DATA/DICTIONARIES";