Keeping surrogates after processing is essential for some users. If they
take up too much disk space, please set up an automatic deletion process
(such as a cron job).
pull/8/head
Michael Peter Christen 10 years ago
parent 213401a446
commit 6f4fe4b175

@ -22,8 +22,13 @@
package net.yacy.document.importer;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.text.ParseException;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.date.GenericFormatter;
@ -176,12 +181,43 @@ public class OAIPMHImporter extends Thread implements Importer, Comparable<OAIPM
return 0;
}
/**
 * Get a map of already loaded oai-pmh servers and their latest access date.
 * Both directories are scanned; if a host shows up in both of them, the
 * more recent of the two dates is kept.
 * @param surrogatesIn first directory whose surrogate file names are evaluated
 * @param surrogatesOut second directory whose surrogate file names are evaluated
 * @return a map where the key is the hostID of the servers and the value is the last access date
 */
public static Map<String, Date> getLoadedOAIServer(File surrogatesIn, File surrogatesOut) {
    final Map<String, Date> map = getLoadedOAIServer(surrogatesOut);
    // Merge entry by entry. The previous code cast entrySet() (a Set) to a Map,
    // which fails with a ClassCastException at runtime.
    for (final Map.Entry<String, Date> entry : getLoadedOAIServer(surrogatesIn).entrySet()) {
        final Date known = map.get(entry.getKey());
        if (known == null || entry.getValue().after(known)) {
            map.put(entry.getKey(), entry.getValue());
        }
    }
    return map;
}
/**
 * Collect the latest access date per host from the surrogate file names in one directory.
 * Only names that start with filenamePrefix, end with ".xml" and carry
 * filenameSeparationChar directly in front of a 17-character timestamp are evaluated.
 * @param surrogates the directory to scan; a missing or unreadable directory yields an empty map
 * @return a mutable map from hostID to the most recent date found for that host
 */
private static Map<String, Date> getLoadedOAIServer(File surrogates) {
    final Map<String, Date> map = new HashMap<String, Date>();
    // File.list() returns null if the path is not a directory or an I/O error occurs
    final String[] names = surrogates == null ? null : surrogates.list();
    if (names == null) return map;

    // name layout: <filenamePrefix><sep><hostID><sep><timestamp>.xml
    // example left by the original author: oaipmh_opus.bsz-bw.de_20091102113118728.xml
    // NOTE(review): that example uses '_' where this code checks for filenameSeparationChar - confirm the current naming scheme
    final int hostStart = filenamePrefix.length() + 1; // prefix plus one separator character
    final int tailLength = 22;                         // separator (1) + timestamp (17) + ".xml" (4)
    for (final String s : names) {
        // too short to hold prefix, separators and timestamp; the index arithmetic below would throw
        if (s.length() < hostStart + tailLength) continue;
        if (!s.startsWith(filenamePrefix) || !s.endsWith(".xml")) continue;
        if (s.charAt(s.length() - tailLength) != filenameSeparationChar) continue;
        try {
            final Date fd = GenericFormatter.SHORT_MILSEC_FORMATTER.parse(s.substring(s.length() - 21, s.length() - 4), 0).getTime();
            final String hostID = s.substring(hostStart, s.length() - tailLength);
            final Date md = map.get(hostID);
            // keep only the most recent date per host
            if (md == null || fd.after(md)) map.put(hostID, fd);
        } catch (final ParseException e) {
            ConcurrentLog.logException(e);
        }
    }
    return map;
}
// NOTE(review): presumably substitutes characters of a host name when the host id is built - usage is outside this excerpt, confirm
public static final char hostReplacementChar = '_';
// character that getLoadedOAIServer expects directly in front of the timestamp part of a surrogate file name
public static final char filenameSeparationChar = '.';
// surrogate file names must start with this prefix to be evaluated by getLoadedOAIServer
public static final String filenamePrefix = "oaipmh";
/**
* compute a host id
* compute a host id that is also used in the getLoadedOAIServer method for the map key
* @param source
* @return a string that is a key for the given host
*/

@ -37,14 +37,17 @@
package net.yacy.search;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.Reader;
import java.net.MalformedURLException;
import java.security.NoSuchAlgorithmException;
@ -74,6 +77,7 @@ import java.util.concurrent.atomic.AtomicInteger;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
@ -245,7 +249,7 @@ public final class Switchboard extends serverSwitch {
public File networkRoot;
public File queuesRoot;
public File surrogatesInPath;
//public File surrogatesOutPath;
public File surrogatesOutPath;
public Segment index;
public LoaderDispatcher loader;
public CrawlSwitchboard crawler;
@ -715,14 +719,23 @@ public final class Switchboard extends serverSwitch {
SwitchboardConstants.SURROGATES_IN_PATH_DEFAULT);
this.log.info("surrogates.in Path = " + this.surrogatesInPath.getAbsolutePath());
this.surrogatesInPath.mkdirs();
/* this.surrogatesOutPath =
this.surrogatesOutPath =
getDataPath(
SwitchboardConstants.SURROGATES_OUT_PATH,
SwitchboardConstants.SURROGATES_OUT_PATH_DEFAULT);
this.log.info("surrogates.out Path = " + this.surrogatesOutPath.getAbsolutePath());
this.surrogatesOutPath.mkdirs();
*/
// copy opensearch heuristic config (if not exist)
final File osdConfig = new File(getDataPath(), "DATA/SETTINGS/heuristicopensearch.conf");
if (!osdConfig.exists()) {
final File osdDefaultConfig = new File("defaults/heuristicopensearch.conf");
this.log.info("heuristic.opensearch list Path = " + osdDefaultConfig.getAbsolutePath());
try {
Files.copy(osdDefaultConfig, osdConfig);
} catch (final IOException ex) { }
}
// create the release download directory
this.releasePath =
getDataPath(SwitchboardConstants.RELEASE_PATH, SwitchboardConstants.RELEASE_PATH_DEFAULT);
@ -1866,6 +1879,8 @@ public final class Switchboard extends serverSwitch {
if ( !infile.exists() || !infile.canWrite() || !infile.canRead() ) {
return false;
}
final File outfile = new File(this.surrogatesOutPath, s);
//if (outfile.exists()) return false;
boolean moved = false;
if ( s.endsWith("xml.zip") ) {
// open the zip file with all the xml files in it
@ -1889,7 +1904,7 @@ public final class Switchboard extends serverSwitch {
} catch (final IOException e ) {
ConcurrentLog.logException(e);
} finally {
moved = infile.delete();
moved = infile.renameTo(outfile);
if (zis != null) try {zis.close();} catch (final IOException e) {}
}
return moved;
@ -1905,7 +1920,29 @@ public final class Switchboard extends serverSwitch {
ConcurrentLog.logException(e);
} finally {
if (!shallTerminate()) {
moved = infile.delete();
moved = infile.renameTo(outfile);
if ( moved ) {
// check if this file is already compressed, if not, compress now
if ( !outfile.getName().endsWith(".gz") ) {
final String gzname = outfile.getName() + ".gz";
final File gzfile = new File(outfile.getParentFile(), gzname);
try {
final OutputStream os =
new BufferedOutputStream(new GZIPOutputStream(new FileOutputStream(gzfile)));
BufferedInputStream bis = new BufferedInputStream(new FileInputStream(outfile));
FileUtils.copy(bis, os);
os.close();
bis.close();
if ( gzfile.exists() ) {
FileUtils.deletedelete(outfile);
}
} catch (final FileNotFoundException e ) {
ConcurrentLog.logException(e);
} catch (final IOException e ) {
ConcurrentLog.logException(e);
}
}
}
}
if (is != null) try {is.close();} catch (IOException e) {}
}

@ -407,8 +407,8 @@ public final class SwitchboardConstants {
public static final String SURROGATES_IN_PATH = "surrogates.in";
public static final String SURROGATES_IN_PATH_DEFAULT = "DATA/SURROGATES/in";
//public static final String SURROGATES_OUT_PATH = "surrogates.out";
//public static final String SURROGATES_OUT_PATH_DEFAULT = "DATA/SURROGATES/out";
// configuration key and default location of the directory that surrogate files are moved to after processing
public static final String SURROGATES_OUT_PATH = "surrogates.out";
public static final String SURROGATES_OUT_PATH_DEFAULT = "DATA/SURROGATES/out";
public static final String DICTIONARY_SOURCE_PATH = "dictionaries";
public static final String DICTIONARY_SOURCE_PATH_DEFAULT = "DATA/DICTIONARIES";

Loading…
Cancel
Save