*) Modifications for dbImport functionality

   - dbImporter threads are now shut down by the switchboard on server shutdown
   - adding possibility to pause an importer thread via GUI
   - Bugfix for abort function
     See: http://www.yacy-forum.de/viewtopic.php?p=13363#13363

*) Modification of content parser configuration
   - now it's possible to configure which parsers should be enabled for the proxy,
     crawler, icap, etc. separately
   - 

*) htmlFilterContentScraper.java
   - adding regular expression to normalize URLs containing /../ and /./ parts

*) httpc.java
   - adding functionality to unzip gzipped content
   - requested by roland: should be used later to allow gzipped seed lists

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1170 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 19 years ago
parent 28ddba8813
commit 44fa94ac52

@ -30,7 +30,6 @@
</form>
<hr>
<form action="IndexImport_p.html" method="post" enctype="multipart/form-data">
<h3>Currently running jobs</h3>
<p>
<table border="0" cellpadding="2" cellspacing="1">
@ -44,12 +43,15 @@
<td class="small" ># URLs</td>
<td class="small" ># Word<br>Entities</td>
<td class="small" ># Word<br>Entries</td>
<td class="small" >Stop Import</td>
<td class="small" >Abort Import</td>
<td class="small" >Pause Import</td>
</tr>
#{running.jobs}#
<form action="IndexImport_p.html" method="post" enctype="multipart/form-data">
<input type="hidden" name="jobNr" value="#[job_nr]#">
<tr class="TableCellLight">
<td class="small">#[path]#</td>
<td class="small"><font color="#(stopped)#red::green#(/stopped)#">#(stopped)#Finished::Running#(/stopped)#</font></td>
<td class="small"><font color="#(status)#red::green::blue#(/status)#">#(status)#Finished::Running::Paused#(/status)#</font></td>
<td class="small" align="right">#[percent]#</td>
<td class="small" align="right">#[elapsed]#</td>
<td class="small" align="right">#[estimated]#</td>
@ -59,14 +61,21 @@
<td class="small" align="right">#[word_entry_num]#</td>
<td class="small">
#(stopped)#::
<input type="submit" name="stopIndexDbImport" value="Stop Index Import">
<input type="hidden" name="jobNr" value="#[job_nr]#">
<input type="submit" name="stopIndexDbImport" value="Abort Import">
#(/stopped)#
</td>
</td>
<td class="small">
#(paused)#
<input type="submit" name="pauseIndexDbImport" value="Pause Import">
::
<input type="submit" name="continueIndexDbImport" value="Continue Import">
#(/paused)#
</td>
</tr>
</form>
#{/running.jobs}#
</table>
</form>
<hr>
<form action="IndexImport_p.html" method="post" enctype="multipart/form-data">
@ -86,7 +95,7 @@
#{finished.jobs}#
<tr class="TableCellLight">
<td class="small">#[path]#</td>
<td class="small"><font color="#(stopped)#red::green::red#(/stopped)#">#(stopped)#Finished::<b>Error:</b> #[errorMsg]##(/stopped)#</font></td>
<td class="small"><font color="#(status)#red::green::red#(/status)#">#(status)#Finished::<b>Error:</b> #[errorMsg]#::Paused#(/status)#</font></td>
<td class="small" align="right">#[percent]#</td>
<td class="small" align="right">#[elapsed]#</td>
<td class="small" align="right"><tt>#[wordHash]#</tt></td>

@ -102,8 +102,12 @@ public final class IndexImport_p {
plasmaDbImporter.finishedJobs.clear();
prop.put("LOCATION","");
return prop;
} else if (post.containsKey("stopIndexDbImport")) {
// getting the job nr of the thread that should be stopped
} else if (
(post.containsKey("stopIndexDbImport")) ||
(post.containsKey("pauseIndexDbImport")) ||
(post.containsKey("continueIndexDbImport"))
) {
// getting the job nr of the thread
String jobNr = (String) post.get("jobNr");
Thread[] importThreads = new Thread[plasmaDbImporter.runningJobs.activeCount()*2];
@ -112,12 +116,13 @@ public final class IndexImport_p {
for (int i=0; i < activeCount; i++) {
plasmaDbImporter currThread = (plasmaDbImporter) importThreads[i];
if (currThread.getJobNr() == Integer.valueOf(jobNr).intValue()) {
currThread.stoppIt();
try {
currThread.join();
} catch (InterruptedException e) {
// TODO Auto-generated catch block
e.printStackTrace();
if (post.containsKey("stopIndexDbImport")) {
currThread.stoppIt();
try { currThread.join(); } catch (InterruptedException e) {e.printStackTrace();}
} else if (post.containsKey("pauseIndexDbImport")) {
currThread.pauseIt();
} else if (post.containsKey("continueIndexDbImport")) {
currThread.continueIt();
}
break;
}
@ -138,30 +143,30 @@ public final class IndexImport_p {
for (int i=0; i < activeCount; i++) {
plasmaDbImporter currThread = (plasmaDbImporter) importThreads[i];
// root path of the source db
prop.put("running.jobs_" + i + "_path", currThread.getImportRoot().toString());
// specifies if the importer is still running
prop.put("running.jobs_" + i + "_stopped", currThread.isAlive() ? 1:0);
// specifies if the importer was paused
prop.put("running.jobs_" + i + "_paused", currThread.isPaused() ? 1:0);
File importPath = currThread.getImportRoot();
String currWordHash = currThread.getCurrentWordhash();
long currWordEntryCount = currThread.getWordEntryCounter();
long currWordEntityCounter = currThread.getWordEntityCounter();
long currUrlCounter = currThread.getUrlCounter();
//long currImportDbSize = currThread.getImportWordDbSize();
long estimatedTime = currThread.getEstimatedTime();
long elapsedTime = currThread.getElapsedTime();
int jobNr = currThread.getJobNr();
int percent = currThread.getProcessingStatus();
// setting the status
prop.put("running.jobs_" + i + "_status", currThread.isPaused() ? 2 : currThread.isAlive() ? 1 : 0);
boolean isRunning = currThread.isAlive();
// other information
prop.put("running.jobs_" + i + "_percent", Integer.toString(currThread.getProcessingStatus()));
prop.put("running.jobs_" + i + "_elapsed", serverDate.intervalToString(currThread.getElapsedTime()));
prop.put("running.jobs_" + i + "_estimated", serverDate.intervalToString(currThread.getEstimatedTime()));
prop.put("running.jobs_" + i + "_wordHash", currThread.getCurrentWordhash());
prop.put("running.jobs_" + i + "_url_num", Long.toString(currThread.getUrlCounter()));
prop.put("running.jobs_" + i + "_word_entity_num", Long.toString(currThread.getWordEntityCounter()));
prop.put("running.jobs_" + i + "_word_entry_num", Long.toString(currThread.getWordEntryCounter()));
prop.put("running.jobs_" + i + "_path", importPath.toString());
prop.put("running.jobs_" + i + "_stopped", isRunning ? 1:0);
prop.put("running.jobs_" + i + "_percent", Integer.toString(percent));
prop.put("running.jobs_" + i + "_elapsed", serverDate.intervalToString(elapsedTime));
prop.put("running.jobs_" + i + "_estimated", serverDate.intervalToString(estimatedTime));
prop.put("running.jobs_" + i + "_wordHash", currWordHash);
prop.put("running.jobs_" + i + "_url_num", Long.toString(currUrlCounter));
prop.put("running.jobs_" + i + "_word_entity_num", Long.toString(currWordEntityCounter));
prop.put("running.jobs_" + i + "_word_entry_num", Long.toString(currWordEntryCount));
prop.put("running.jobs_" + i + "_stopped_job_nr", Integer.toString(jobNr));
// job number of the importer thread
prop.put("running.jobs_" + i + "_job_nr", Integer.toString(currThread.getJobNr()));
}
prop.put("running.jobs",activeCount);
@ -174,10 +179,10 @@ public final class IndexImport_p {
String error = currThread.getError();
prop.put("finished.jobs_" + i + "_path", currThread.getImportRoot().toString());
if (error != null) {
prop.put("finished.jobs_" + i + "_stopped", 2);
prop.put("finished.jobs_" + i + "_stopped_errorMsg", error);
prop.put("finished.jobs_" + i + "_status", 2);
prop.put("finished.jobs_" + i + "_status_errorMsg", error);
} else {
prop.put("finished.jobs_" + i + "_stopped", 0);
prop.put("finished.jobs_" + i + "_status", 0);
}
prop.put("finished.jobs_" + i + "_percent", Integer.toString(currThread.getProcessingStatus()));
prop.put("finished.jobs_" + i + "_elapsed", serverDate.intervalToString(currThread.getElapsedTime()));

@ -35,7 +35,8 @@ If you click on it while browsing, the currently viewed website will be inserted
</td>
</tr>
<tr>
<td><a href="javascript:w = window.open('http://#[host]#:#[port]#/QuickCrawlLink_p.html?localIndexing=on&amp;crawlingQ=on&amp;xdstopw=on&amp;title='+escape(document.title)+'&amp;url='+location.href,'_blank','height=150,width=500,resizable=yes,scrollbar=no,directory=no,menubar=no,location=no');w.focus();"><img src="/env/grafics/addlink.gif" border="0">&nbsp;Crawl with YaCy</a></td></tr>
<td><a href="javascript:w = window.open('http://#[host]#:#[port]#/QuickCrawlLink_p.html?localIndexing=on&amp;crawlingQ=on&amp;xdstopw=on&amp;title='+escape(document.title)+'&amp;url='+location.href,'_blank','height=150,width=500,resizable=yes,scrollbar=no,directory=no,menubar=no,location=no');w.focus();"><img src="/env/grafics/addlink.gif" border="0">&nbsp;Crawl with YaCy</a></td>
</tr>
</table>
::<!-- 1 -->

@ -79,13 +79,15 @@ Your Peer Language is: <font color="#556699">#[peerLang]#</font><br>
Peer names must not contain characters other than (a-z, A-Z, 0-9, '-', '_') and must not be longer than 80 characters.
Your Peer Language is: <font color="#556699">#[peerLang]#</font><br>
::<!-- 18 -->
<p>
The new parser settings where changed successfully.<br>
Parsing of the following mime-types was enabled:<br>
<ul>
Parsing of the following mime-types was enabled:
</p>
<table>
#{parser}#
<li><font color="#556699">#[enabledMime]#</font></li>
<tr><td><font color="#556699">#[parserMode]#</font></td><td><font color="#556699">#[enabledMime]#</font></td></tr>
#{/parser}#
</ul>
</table>
::<!-- 19 -->
Seed Upload method was changed successfully.
#(success)#::<br>You are now a principal peer.#(/success)#

@ -49,14 +49,18 @@
import java.net.InetSocketAddress;
import java.net.SocketException;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import de.anomic.http.httpHeader;
import de.anomic.http.httpRemoteProxyConfig;
import de.anomic.http.httpd;
import de.anomic.http.httpdProxyHandler;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaParserConfig;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.server.serverCodings;
import de.anomic.server.serverCore;
@ -537,32 +541,82 @@ public class SettingsAck_p {
/*
* Parser configuration
*/
if (post.containsKey("parserSettings")) {
plasmaSwitchboard sb = (plasmaSwitchboard)env;
post.remove("parserSettings");
if (post.containsKey("parserSettings")) {
post.remove("parserSettings");
String[] enabledMimes = null;
if (post.containsKey("allParserEnabled")) {
// enable all available parsers
enabledMimes = plasmaParser.setEnabledParserList(sb.parser.getAvailableParserList().keySet());
} else {
// activate all received parsers
enabledMimes = plasmaParser.setEnabledParserList(post.keySet());
}
Arrays.sort(enabledMimes);
HashMap newConfigList = new HashMap();
Set parserModes = plasmaParser.getParserConfigList().keySet();
StringBuffer enabledMimesTxt = new StringBuffer();
for (int i=0; i < enabledMimes.length; i++) {
enabledMimesTxt.append(enabledMimes[i]).append(",");
prop.put("info_parser_" + i + "_enabledMime",enabledMimes[i]);
// looping through all received settings
int pos;
Enumeration keyEnum = post.keys();
while (keyEnum.hasMoreElements()) {
String key = (String) keyEnum.nextElement();
if ((pos = key.indexOf(".")) != -1) {
String currParserMode = key.substring(0,pos).trim().toUpperCase();
String currMimeType = key.substring(pos+1).replaceAll("\n", "");
if (parserModes.contains(currParserMode)) {
HashSet currEnabledMimeTypes;
if (newConfigList.containsKey(currParserMode)) {
currEnabledMimeTypes = (HashSet) newConfigList.get(currParserMode);
} else {
currEnabledMimeTypes = new HashSet();
newConfigList.put(currParserMode, currEnabledMimeTypes);
}
currEnabledMimeTypes.add(currMimeType);
}
}
}
prop.put("info_parser",enabledMimes.length);
if (enabledMimesTxt.length() > 0) enabledMimesTxt.deleteCharAt(enabledMimesTxt.length()-1);
env.setConfig("parseableMimeTypes",enabledMimesTxt.toString());
int enabledMimesCount = 0;
StringBuffer currEnabledMimesTxt = new StringBuffer();
Iterator parserModeIter = newConfigList.keySet().iterator();
while (parserModeIter.hasNext()) {
String currParserMode = (String)parserModeIter.next();
String[] enabledMimes = plasmaParser.setEnabledParserList(currParserMode, (Set)newConfigList.get(currParserMode));
Arrays.sort(enabledMimes);
currEnabledMimesTxt.setLength(0);
for (int i=0; i < enabledMimes.length; i++) {
currEnabledMimesTxt.append(enabledMimes[i]).append(",");
prop.put("info_parser_" + enabledMimesCount + "_parserMode",currParserMode);
prop.put("info_parser_" + enabledMimesCount + "_enabledMime",enabledMimes[i]);
enabledMimesCount++;
}
if (currEnabledMimesTxt.length() > 0) currEnabledMimesTxt.deleteCharAt(currEnabledMimesTxt.length()-1);
env.setConfig("parseableMimeTypes." + currParserMode,currEnabledMimesTxt.toString());
}
prop.put("info_parser",enabledMimesCount);
prop.put("info", 18);
return prop;
// plasmaSwitchboard sb = (plasmaSwitchboard)env;
//
// HashMap configList = plasmaParser.getParserConfigList();
// Iterator parserModeIter = configList.keySet().iterator();
//
// String[] enabledMimes = null;
// if (post.containsKey("allParserEnabled")) {
// // enable all available parsers
// enabledMimes = plasmaParser.setEnabledParserList(sb.parser.getAvailableParserList().keySet());
// } else {
// // activate all received parsers
// enabledMimes = plasmaParser.setEnabledParserList(post.keySet());
// }
// Arrays.sort(enabledMimes);
//
// StringBuffer enabledMimesTxt = new StringBuffer();
// for (int i=0; i < enabledMimes.length; i++) {
// enabledMimesTxt.append(enabledMimes[i]).append(",");
// prop.put("info_parser_" + i + "_enabledMime",enabledMimes[i]);
// }
// prop.put("info_parser",enabledMimes.length);
// if (enabledMimesTxt.length() > 0) enabledMimesTxt.deleteCharAt(enabledMimesTxt.length()-1);
//
// env.setConfig("parseableMimeTypes",enabledMimesTxt.toString());
//
// prop.put("info", 18);
// return prop;
}

@ -5,33 +5,41 @@ For a detailed description of the various MIME-types take a look at <a href="htt
<p>
<table border="0" cellpadding="2" cellspacing="1">
<tr class="TableHeader" valign="bottom">
<td class="small" >Activate</td>
#{parserMode}#
<td class="small" >#[name]#</td>
#{/parserMode}#
<td class="small" >Mime-Type</td>
<td class="small" >Parser&nbsp;Usage</td>
<td class="small" ></td>
</tr>
#{parser}#
<tr class="TableCellDark">
<td colspan="2">#[name]# V#[version]#</td>
<td colspan="#[colspan]#"><nobr>#[name]# V#[version]#<nobr></td>
<td>#[usage]#</td>
<td>&nbsp;</td>
</tr>
#{mime}#
<tr class="TableCellLight">
<td class="small" align="center"><input type="checkbox" name="#[mimetype]#" align="top" #(status)#::checked#(/status)#></td>
#{parserMode}#
<td class="small" align="center"><input type="checkbox" name="#[optionName]#" align="top" #(status)#::checked#(/status)#></td>
#{/parserMode}#
<td class="small">#[mimetype]#</td>
<td class="small">&nbsp;</td>
<td class="small" width="100%"></td>
</tr>
#{/mime}#
#{/parser}#
<!--
<tr class="TableCellDark">
<td class="small" align="center"><input type="checkbox" name="allParserEnabled" align="top" #(allParserEnabled)#::checked#(/allParserEnabled)#>
#{parserMode}#
<td class="small" align="center"><input type="checkbox" name="#[name]#.allParserEnabled" align="top" #(allParserEnabled)#::checked#(/allParserEnabled)#>
#{/parserMode}#
<td colspan="2" class="small" >Enable all parsers</td>
<td class="small">&nbsp;</td>
</tr>
-->
<tr class="TableCellDark">
<td colspan="4" class="small" ><input type="submit" name="parserSettings" value="submit">&nbsp;Changes take effect immediately</td>
<td colspan="#[parser.colspan]#" class="small" ><input type="submit" name="parserSettings" value="submit">&nbsp;Changes take effect immediately</td>
</tr>
</table>
</fieldset>

@ -49,6 +49,8 @@ import java.util.HashSet;
import java.util.Iterator;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaParser;
import de.anomic.plasma.plasmaParserConfig;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.parser.ParserInfo;
import de.anomic.server.serverObjects;
@ -248,7 +250,10 @@ public final class Settings_p {
* Parser Configuration
*/
plasmaSwitchboard sb = (plasmaSwitchboard)env;
HashSet enabledParsers = sb.parser.getEnabledParserList();
HashMap configList = plasmaParser.getParserConfigList();
plasmaParserConfig[] configArray = (plasmaParserConfig[]) configList.values().toArray(new plasmaParserConfig[configList.size()]);
HashSet parserInfos = new HashSet(sb.parser.getAvailableParserList().values());
// // fetching a list of all available mimetypes
@ -258,7 +263,7 @@ public final class Settings_p {
// Collections.sort(availableParserKeys);
// loop through the mimeTypes and add it to the properties
boolean allParsersEnabled = true;
boolean[] allParsersEnabled = new boolean[configList.size()];
int parserIdx = 0;
Iterator availableParserIter = parserInfos.iterator();
@ -267,20 +272,23 @@ public final class Settings_p {
prop.put("parser_" + parserIdx + "_name", parserInfo.parserName);
prop.put("parser_" + parserIdx + "_version", parserInfo.parserVersionNr);
prop.put("parser_" + parserIdx + "_usage", Integer.toString(parserInfo.usageCount));
prop.put("parser_" + parserIdx + "_colspan",Integer.toString(configArray.length+1));
int mimeIdx = 0;
Enumeration mimeTypeIter = parserInfo.supportedMimeTypes.keys();
while (mimeTypeIter.hasMoreElements()) {
String mimeType = (String)mimeTypeIter.nextElement();
boolean parserIsEnabled = enabledParsers.contains(mimeType);
prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_mimetype", mimeType);
//prop.put("parser_" + parserIdx + "_name", parserName);
//prop.put("parser_" + parserIdx + "_shortname", parserName.substring(parserName.lastIndexOf(".")+1));
prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_status", enabledParsers.contains(mimeType) ? 1:0);
allParsersEnabled &= parserIsEnabled;
for (int i=0; i<configArray.length; i++) {
HashSet enabledParsers = configArray[i].getEnabledParserList();
prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_parserMode_" + i + "_optionName", configArray[i].parserMode + "." + mimeType);
prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_parserMode_" + i + "_status", enabledParsers.contains(mimeType) ? 1:0);
allParsersEnabled[i] &= enabledParsers.contains(mimeType);
}
prop.put("parser_" + parserIdx + "_mime_" + mimeIdx + "_parserMode",configArray.length);
mimeIdx++;
}
prop.put("parser_" + parserIdx + "_mime", mimeIdx);
@ -288,8 +296,13 @@ public final class Settings_p {
parserIdx++;
}
prop.put("allParserEnabled",allParsersEnabled ? 1:0);
for (int i=0; i<configArray.length; i++) {
prop.put("parserMode_" + i + "_name",configArray[i].parserMode);
prop.put("parserMode_" + i + "_allParserEnabled",allParsersEnabled[i] ? 1:0);
}
prop.put("parserMode",configArray.length);
prop.put("parser", parserIdx);
prop.put("parser.colspan", Integer.toString(configArray.length+3));
// return rewrite properties
return prop;

@ -51,6 +51,9 @@ import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import de.anomic.server.logging.serverLog;
import de.anomic.server.serverByteBuffer;
@ -163,6 +166,14 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
// (this is different from previous normal forms where a '/' must not appear in root paths; here it must appear. Makes everything easier.)
int cpos = path.indexOf("#");
if (cpos >= 0) path = path.substring(0, cpos);
Pattern pathPattern = Pattern.compile("(/[^/\\.]+/)(?<!/[.]{2}/)[.]{2}(?=/)|/\\.(?=/)");
Matcher matcher = pathPattern.matcher(path);
while (matcher.find()) {
path = matcher.replaceAll("");
matcher.reset(path);
}
if (defaultPort) {
return url.getProtocol() + "://" + url.getHost() + path;
} else {

@ -40,6 +40,8 @@
package de.anomic.http;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
@ -1239,6 +1241,31 @@ do upload
);
if (a == null) return null;
// support of gzipped data (requested by roland)
// The two leading bytes are combined little-endian and compared with the gzip magic
// number. Java bytes are signed, so each byte is masked with 0xff first; without the
// mask the second magic byte (0x8b) is sign-extended and the comparison never matches.
if ((a.length > 1) && ((((a[1] & 0xff) << 8) | (a[0] & 0xff)) == GZIPInputStream.GZIP_MAGIC)) {
    try {
        ByteArrayInputStream byteInput = new ByteArrayInputStream(a);
        ByteArrayOutputStream byteOutput = new ByteArrayOutputStream();
        GZIPInputStream zippedContent = new GZIPInputStream(byteInput);

        byte[] data = new byte[1024];
        int read = 0;

        // reading gzip file and store it uncompressed
        while ((read = zippedContent.read(data, 0, data.length)) != -1) {
            byteOutput.write(data, 0, read);
        }
        zippedContent.close();
        byteOutput.close();

        a = byteOutput.toByteArray();
    } catch (Exception e) {
        // Content that is reported as "Not in GZIP format" is passed on unchanged.
        // The constant-first comparison also copes with exceptions without a message.
        if (!"Not in GZIP format".equals(e.getMessage())) {
            // keep the original exception as cause so the stack trace is not lost
            IOException ioe = new IOException(e.getMessage());
            ioe.initCause(e);
            throw ioe;
        }
    }
}
int s = 0;
int e;
ArrayList v = new ArrayList();
@ -1872,6 +1899,7 @@ final class httpcFactory implements org.apache.commons.pool.PoolableObjectFactor
* @see org.apache.commons.pool.PoolableObjectFactory#destroyObject(java.lang.Object)
*/
public void destroyObject(Object obj) {
assert(obj instanceof httpc): "Invalid object type added to pool.";
if (obj instanceof httpc) {
httpc theHttpc = (httpc) obj;
@ -1883,12 +1911,7 @@ final class httpcFactory implements org.apache.commons.pool.PoolableObjectFactor
* @see org.apache.commons.pool.PoolableObjectFactory#validateObject(java.lang.Object)
*/
public boolean validateObject(Object obj) {
    // No per-instance validation is performed: the type is only checked when
    // assertions are enabled, and every object is reported as valid.
    assert obj instanceof httpc : "Invalid object type in pool.";
    return true;
}
@ -1905,12 +1928,7 @@ final class httpcFactory implements org.apache.commons.pool.PoolableObjectFactor
*
*/
public void passivateObject(Object obj) {
    // Nothing is reset when an instance goes back to the pool; the type is only
    // checked when assertions are enabled.
    assert obj instanceof httpc : "Invalid object type returned to pool.";
}
}

@ -763,7 +763,7 @@ public final class httpdProxyHandler extends httpdAbstractHandler implements htt
// make a transformer
if ((!(transformer.isIdentityTransformer())) &&
((ext == null) || (!(plasmaParser.mediaExtContains(ext)))) &&
((ext == null) || (!(plasmaParser.supportedRealtimeFileExtContains(url)))) &&
((cachedResponseHeader == null) || (plasmaParser.realtimeParsableMimeTypesContains(cachedResponseHeader.mime())))) {
hfos = new htmlFilterOutputStream((chunkedOut != null) ? chunkedOut : respond, null, transformer, (ext.length() == 0));
} else {

@ -372,8 +372,7 @@ public class icapd implements serverHandler {
reader.close();
resHdrStream.close();
if ((!(plasmaParser.supportedMimeTypesContains(httpResHeader.mime()))) &&
(!(plasmaParser.supportedFileExt(httpRequestURL)))) {
if (!plasmaParser.supportedContent(plasmaParser.PARSER_MODE_ICAP, httpRequestURL, httpResHeader.mime())) {
this.log.logInfo("Wrong mimeType or fileExtension for indexing:" +
"\nMimeType: " + httpResHeader.mime() +
"\nRequest Line:" + httpRequestLine);

@ -154,6 +154,7 @@ implements Parser {
Collection subMatches = match.getSubMatches();
if ((subMatches != null) && (!subMatches.isEmpty())) {
mimeType = ((MagicMatch) subMatches.iterator().next()).getMimeType();
if ((mimeType == null)||(mimeType.length() == 0)) mimeType = match.getMimeType();
} else {
mimeType = match.getMimeType();
}

@ -375,7 +375,7 @@ public final class plasmaCrawlWorker extends Thread {
// request has been placed and result has been returned. work off response
File cacheFile = cacheManager.getCachePath(url);
try {
if (plasmaParser.supportedContent(url,res.responseHeader.mime())) {
if (plasmaParser.supportedContent(plasmaParser.PARSER_MODE_CRAWLER,url,res.responseHeader.mime())) {
if (cacheFile.isFile()) {
cacheManager.deleteFile(url);
}
@ -521,7 +521,7 @@ public final class plasmaCrawlWorker extends Thread {
}
// returning the used httpc
httpc.returnInstance(remote);
if (remote != null) httpc.returnInstance(remote);
remote = null;
// setting the retry counter to 1

@ -26,10 +26,10 @@ public class plasmaDbImporter extends Thread {
private final serverLog log;
private boolean stopped = false;
//private boolean paused = false;
private boolean paused = false;
private String wordHash = "------------";
long wordChunkStart = System.currentTimeMillis(), wordChunkEnd = wordChunkStart;
long wordChunkStart = System.currentTimeMillis(), wordChunkEnd = this.wordChunkStart;
String wordChunkStartHash = "------------", wordChunkEndHash;
private long urlCounter = 0, wordCounter = 0, entryCounter = 0;
@ -40,6 +40,74 @@ public class plasmaDbImporter extends Thread {
/**
 * Requests termination of this import job.
 * Sets the stop flag that isAborted() reports and then calls continueIt(),
 * so that a job which is currently paused is woken up and can notice the request.
 */
public void stoppIt() {
// the flag must be set before the thread is woken up, otherwise a paused job
// could resume without seeing the stop request
this.stopped = true;
this.continueIt();
}
/**
 * Marks this import job as paused.
 * The flag is written while holding the monitor of this thread object; the job
 * itself blocks the next time it evaluates the flag in isAborted().
 */
public void pauseIt() {
    synchronized (this) { paused = true; }
}
/**
 * Resumes a paused import job.
 * Does nothing if the job is not paused; otherwise clears the pause flag and
 * notifies all threads waiting on the monitor of this thread object.
 */
public void continueIt() {
    synchronized (this) {
        if (!this.paused) return; // nothing to resume
        this.paused = false;
        this.notifyAll();
    }
}
/**
 * @return <code>true</code> if the pause flag of this import job is currently set.
 *         The flag is read while holding the monitor of this thread object.
 */
public boolean isPaused() {
    final boolean pausedNow;
    synchronized (this) {
        pausedNow = this.paused;
    }
    return pausedNow;
}
/**
 * Can be used to close all still running importer threads
 * e.g. on server shutdown.
 * <p>
 * The shutdown is done in three steps: every live thread of the
 * <code>runningJobs</code> group is asked to stop via {@link #stoppIt()}, after a
 * short grace period the whole group is interrupted, and finally each thread is
 * joined with a timeout of 500 ms. Errors are logged and never propagated.
 */
public static void close() {
    // activeCount() is only an estimate and enumerate() silently ignores all threads
    // that do not fit into the array, therefore the array is allocated with headroom
    Thread[] threadList = new Thread[plasmaDbImporter.runningJobs.activeCount() * 2];
    int threadCount = plasmaDbImporter.runningJobs.enumerate(threadList);

    if (threadCount == 0) return;

    serverLog log = new serverLog("DB-IMPORT");
    try {
        // trying to gracefully stop all still running importer threads ...
        log.logInfo("Signaling shutdown to " + threadCount + " remaining dbImporter threads ...");
        for (int currentThreadIdx = 0; currentThreadIdx < threadCount; currentThreadIdx++) {
            Thread currentThread = threadList[currentThreadIdx];
            if (currentThread.isAlive()) {
                ((plasmaDbImporter) currentThread).stoppIt();
            }
        }

        // waiting a few ms for the importer threads to react on the stop request;
        // an interruption only shortens the grace period and is ignored on purpose
        try { Thread.sleep(500); } catch (InterruptedException ex) {}

        // interrupting all still running threads ...
        log.logInfo("Sending interruption signal to " + runningJobs.activeCount() + " remaining dbImporter threads ...");
        plasmaDbImporter.runningJobs.interrupt();

        // we need to use a timeout here because a thread may not react on the interruption ...
        log.logFine("Waiting for " + runningJobs.activeCount() + " remaining dbImporter threads to finish shutdown ...");
        for (int currentThreadIdx = 0; currentThreadIdx < threadCount; currentThreadIdx++) {
            Thread currentThread = threadList[currentThreadIdx];
            if (currentThread.isAlive()) {
                log.logFine("Waiting for dbImporter thread '" + currentThread.getName() + "' [" + currentThreadIdx + "] to finish shutdown.");
                // best effort: continue with the next thread if the wait is interrupted
                try { currentThread.join(500); } catch (InterruptedException ex) {}
            }
        }

        log.logInfo("Shutdown of remaining dbImporter threads finished.");
    } catch (Exception e) {
        log.logSevere("Unexpected error while trying to shutdown all remaining dbImporter threads.", e);
    }
}
public String getError() {
@ -94,7 +162,7 @@ public class plasmaDbImporter extends Thread {
if (theHomeUrlDB == null) throw new NullPointerException();
this.homeUrlDB = theHomeUrlDB;
if (this.homeWordIndex.getRoot().equals(importRoot)) {
if (this.homeWordIndex.getRoot().equals(this.importRoot)) {
throw new IllegalArgumentException("Import and home DB directory must not be equal");
}
@ -120,7 +188,7 @@ public class plasmaDbImporter extends Thread {
try {
importWordsDB();
} finally {
globalEnd = System.currentTimeMillis();
this.globalEnd = System.currentTimeMillis();
finishedJobs.add(this);
}
}
@ -249,6 +317,15 @@ public class plasmaDbImporter extends Thread {
}
/**
 * Checks whether the import has to be aborted. If the job is currently paused the
 * calling thread blocks here until the pause flag is cleared or the thread is
 * interrupted.
 *
 * @return <code>true</code> if the stop flag is set or the current thread was interrupted
 */
private boolean isAborted() {
    synchronized (this) {
        // wait() may return without a notification (spurious wakeup), therefore the
        // pause flag is re-checked in a loop
        while (this.paused) {
            try {
                this.wait();
            } catch (InterruptedException e) {
                // wait() clears the interrupt status when it throws; restore it so that
                // the check below reports the interruption as an abort request
                Thread.currentThread().interrupt();
                break;
            }
        }
    }

    return (this.stopped) || Thread.currentThread().isInterrupted();
}

@ -42,7 +42,6 @@
// compile: javac -classpath lib/commons-collections.jar:lib/commons-pool-1.2.jar -sourcepath source source/de/anomic/plasma/plasmaParser.java
package de.anomic.plasma;
import java.io.BufferedInputStream;
@ -56,7 +55,6 @@ import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.util.Arrays;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Hashtable;
@ -81,53 +79,35 @@ import org.apache.commons.pool.KeyedPoolableObjectFactory;
import org.apache.commons.pool.impl.GenericObjectPool;
public final class plasmaParser {
public static final String PARSER_MODE_PROXY = "PROXY";
public static final String PARSER_MODE_CRAWLER = "CRAWLER";
public static final String PARSER_MODE_URLREDIRECTOR = "URLREDIRECTOR";
public static final String PARSER_MODE_ICAP = "ICAP";
public static final HashSet PARSER_MODE = new HashSet(Arrays.asList(new String[]{
PARSER_MODE_PROXY,
PARSER_MODE_CRAWLER,
PARSER_MODE_ICAP,
PARSER_MODE_URLREDIRECTOR
}));
private static final HashMap parserConfigList = new HashMap();
/**
* A list containing all installed parsers and the mimeType that they support
* @see #loadAvailableParserList()
*/
private static final Properties availableParserList = new Properties();
/**
* A list containing all enabled parsers and the mimeType that they can handle
* @see #loadEnabledParserList()
* @see #setEnabledParserList(Enumeration)
*/
private static final HashSet enabledParserList = new HashSet();
/**
* A list of file extensions that are supported by all enabled parsers
*/
private static final HashSet supportedFileExt = new HashSet();
static final Properties availableParserList = new Properties();
/**
* A list of file extensions that are supported by the html-parser and can
* be parsed in realtime.
*/
private static final HashSet supportedRealtimeFileExt = new HashSet();
/**
* A list of mimeTypes that are generic
*/
private static final HashSet genericMimeTypes = new HashSet();
static {
genericMimeTypes.add("text/plain");
genericMimeTypes.add("text/text");
genericMimeTypes.add("text/xml");
genericMimeTypes.add("application/xml");
genericMimeTypes.add("application/x-xml");
genericMimeTypes.add("application/octet-stream");
genericMimeTypes.add("application/zip");
genericMimeTypes.add("application/x-zip");
genericMimeTypes.add("application/x-zip-compressed");
genericMimeTypes.add("application/x-compress");
genericMimeTypes.add("application/x-compressed");
}
static final HashSet supportedRealtimeFileExt = new HashSet();
/**
* A list of mimeTypes that can be parsed in Realtime (on the fly)
*/
private static final HashSet realtimeParsableMimeTypes = new HashSet();
static final HashSet realtimeParsableMimeTypes = new HashSet();
private static final Properties mimeTypeLookupByFileExt = new Properties();
static {
@ -147,7 +127,7 @@ public final class plasmaParser {
* @see plasmaParserPool
* @see plasmaParserFactory
*/
private static plasmaParserPool theParserPool;
static plasmaParserPool theParserPool;
/**
* A list of media extensions that should <b>not</b> be handled by the plasmaParser
@ -209,6 +189,10 @@ public final class plasmaParser {
private serverLog theLogger = new serverLog("PARSER");
/**
 * Returns the map holding the parser configurations.
 * The internal static map itself is returned, not a copy, so changes made by
 * a caller are visible to every other user of this class.
 * NOTE(review): presumably keyed by the PARSER_MODE_* names with
 * plasmaParserConfig values - confirm against the code that fills the map.
 *
 * @return the static parser configuration map
 */
public static HashMap getParserConfigList() {
return parserConfigList;
}
/**
* This function is used to initialize the realtimeParsableMimeTypes List.
* This list contains a list of mimeTypes that can be parsed in realtime by
@ -241,17 +225,7 @@ public final class plasmaParser {
}
}
public static void initParseableMimeTypes(String enabledMimeTypes) {
HashSet mimeTypes = null;
if ((enabledMimeTypes == null) || (enabledMimeTypes.length() == 0)) {
mimeTypes = new HashSet();
} else {
String[] enabledMimeTypeList = enabledMimeTypes.split(",");
mimeTypes = new HashSet(enabledMimeTypeList.length);
for (int i = 0; i < enabledMimeTypeList.length; i++) mimeTypes.add(enabledMimeTypeList[i].toLowerCase().trim());
}
setEnabledParserList(mimeTypes);
}
public static List extString2extList(String extString) {
LinkedList extensions = new LinkedList();
@ -291,32 +265,17 @@ public final class plasmaParser {
}
}
/**
 * Tests whether content with the given URL and mime type is supported.
 * For "text/html" only the mime type is checked; for every other mime type
 * the file extension of the URL has to be supported as well.
 *
 * @param url the URL of the content
 * @param mimeType the mime type of the content
 * @return <code>true</code> if the content is supported
 */
public static boolean supportedContent(URL url, String mimeType) {
    // TODO: we need some exceptions here to index URLs like this
    // http://www.musicabona.com/respighi/12668/cd/index.html.fr
    final String realMimeType = getRealMimeType(mimeType);
    final boolean isHtml = realMimeType.equals("text/html");
    if (!supportedMimeTypesContains(realMimeType)) return false;
    // html is accepted on the mime type alone, everything else needs a known extension
    return isHtml || supportedFileExt(url);
}
public static boolean supportedRealTimeContent(URL url, String mimeType) {
return realtimeParsableMimeTypesContains(mimeType) && supportedFileExt(url);
return realtimeParsableMimeTypesContains(mimeType) && supportedRealtimeFileExtContains(url);
}
public static boolean supportedMimeTypesContains(String mimeType) {
mimeType = getRealMimeType(mimeType);
synchronized (realtimeParsableMimeTypes) {
if (realtimeParsableMimeTypes.contains(mimeType)) return true;
}
synchronized (enabledParserList) {
return enabledParserList.contains(mimeType);
}
/**
 * Tests whether the file extension of the given URL belongs to the
 * extensions that can be parsed in realtime.
 * The extension is trimmed and lower-cased before the lookup, the same
 * normalization {@link #supportedFileExtContains(String)} applies, so that
 * e.g. <code>INDEX.HTML</code> is treated like <code>index.html</code>.
 *
 * @param url the URL whose file extension should be checked
 * @return <code>true</code> if the extension is contained in the realtime extension list
 */
public static boolean supportedRealtimeFileExtContains(URL url) {
    String fileExt = getFileExt(url);
    if (fileExt == null) return false;
    // getFileExt hands back the raw substring of the path, normalize it first
    fileExt = fileExt.trim().toLowerCase();
    synchronized (supportedRealtimeFileExt) {
        return supportedRealtimeFileExt.contains(fileExt);
    }
}
public static String getFileExt(URL url) {
// getting the file path
@ -339,35 +298,12 @@ public final class plasmaParser {
if (p < 0) return "";
return name.substring(p + 1);
}
/**
 * Tests whether the file extension of the given URL is supported.
 *
 * @param url the URL whose file extension should be checked
 * @return the result of {@link #supportedFileExtContains(String)} for the extension of the URL
 */
public static boolean supportedFileExt(URL url) {
    // extract the extension and delegate the lookup
    return supportedFileExtContains(getFileExt(url));
}
/**
 * Tests whether a file extension is contained in the list of supported
 * extensions or in the list of realtime extensions.
 * The extension is trimmed and lower-cased before the lookup.
 *
 * @param fileExt the file extension, may be <code>null</code>
 * @return <code>true</code> if one of the two lists contains the extension
 */
public static boolean supportedFileExtContains(String fileExt) {
    if ((supportedFileExt == null) || (fileExt == null)) return false;
    final String normalizedExt = fileExt.trim().toLowerCase();

    boolean known;
    synchronized (supportedFileExt) {
        known = supportedFileExt.contains(normalizedExt);
    }
    if (!known) {
        synchronized (supportedRealtimeFileExt) {
            known = supportedRealtimeFileExt.contains(normalizedExt);
        }
    }
    return known;
}
public static boolean mediaExtContains(String mediaExt) {
if (mediaExt == null) return false;
mediaExt = mediaExt.trim().toLowerCase();
synchronized (supportedFileExt) {
if (supportedFileExt.contains(mediaExt)) return false;
}
synchronized (supportedRealtimeFileExt) {
if (supportedRealtimeFileExt.contains(mediaExt)) return false;
}
@ -389,74 +325,7 @@ public final class plasmaParser {
/**
 * Looks up the mime type that is registered for a file extension.
 * The extension is used for the lookup exactly as it is passed in.
 *
 * @param fileExt the file extension to look up
 * @return the registered mime type, or "application/octet-stream" if no
 * entry exists for the extension
 */
public static String getMimeTypeByFileExt(String fileExt) {
return mimeTypeLookupByFileExt.getProperty(fileExt,"application/octet-stream");
}
/**
 * Creates a new parser object. The constructor performs no work.
 */
public plasmaParser() {
// nothing to do here at the moment
}
/**
 * Enables the parsers for all mime types that are keys of the
 * available parser list.
 */
public static void enableAllParsers() {
    setEnabledParserList(availableParserList.keySet());
}
/**
 * Replaces the list of enabled parsers and the list of supported file
 * extensions.
 * For every mime type of the given set that has an entry in the available
 * parser list, the matching parser is borrowed from the parser pool and asked
 * for the file extensions it supports for this mime type. Mime types without
 * an available parser are ignored. If querying a parser fails, the error is
 * logged and the mime type is not enabled.
 *
 * @param mimeTypeSet the mime types that should be enabled, may be <code>null</code>
 * @return the mime types that were actually enabled
 */
public static String[] setEnabledParserList(Set mimeTypeSet) {
    HashSet newEnabledParsers = new HashSet();
    HashSet newSupportedFileExt = new HashSet();

    if (mimeTypeSet != null) {
        Iterator mimeTypes = mimeTypeSet.iterator();
        while (mimeTypes.hasNext()) {
            String mimeType = (String) mimeTypes.next();
            if (!availableParserList.containsKey(mimeType)) continue;

            // the pool is keyed by the parser class name; the key is remembered so
            // that the parser is returned under the same key it was borrowed with
            String parserClassName = null;
            Parser theParser = null;
            try {
                // getting the parser
                parserClassName = ((ParserInfo) availableParserList.get(mimeType)).parserClassName;
                theParser = (Parser) plasmaParser.theParserPool.borrowObject(parserClassName);

                // getting a list of mimeTypes that the parser supports
                Hashtable parserSupportsMimeTypes = theParser.getSupportedMimeTypes();
                if (parserSupportsMimeTypes != null) {
                    Object supportedExtensions = parserSupportsMimeTypes.get(mimeType);
                    if ((supportedExtensions instanceof String) &&
                        (((String) supportedExtensions).length() > 0)) {
                        String[] extArray = ((String) supportedExtensions).split(",");
                        newSupportedFileExt.addAll(Arrays.asList(extArray));
                    }
                }
                newEnabledParsers.add(mimeType);
            } catch (Exception e) {
                serverLog.logSevere("PARSER", "error in setEnabledParserList", e);
            } finally {
                if (theParser != null) {
                    try {
                        plasmaParser.theParserPool.returnObject(parserClassName, theParser);
                    } catch (Exception e) {
                        // best effort: a parser that cannot be returned is dropped
                    }
                }
            }
        }
    }

    synchronized (enabledParserList) {
        enabledParserList.clear();
        enabledParserList.addAll(newEnabledParsers);
    }

    synchronized (supportedFileExt) {
        supportedFileExt.clear();
        supportedFileExt.addAll(newSupportedFileExt);
    }

    return (String[]) newEnabledParsers.toArray(new String[newEnabledParsers.size()]);
}
/**
 * Returns a copy of the list of enabled parsers.
 * The copy is created while holding the lock on the list.
 *
 * @return a clone of the enabled parser list
 */
public HashSet getEnabledParserList() {
    final HashSet snapshot;
    synchronized (plasmaParser.enabledParserList) {
        snapshot = (HashSet) plasmaParser.enabledParserList.clone();
    }
    return snapshot;
}
/**
 * Returns the table of available parsers.
 * The internal table is handed out directly, no copy is made.
 * NOTE(review): entries appear to map a mime type to a ParserInfo object - confirm.
 *
 * @return the available parser table
 */
public Hashtable getAvailableParserList() {
return plasmaParser.availableParserList;
}
@ -556,8 +425,12 @@ public final class plasmaParser {
public void close() {
// clearing the parser list
synchronized (enabledParserList) {
enabledParserList.clear();
Iterator configs = parserConfigList.values().iterator();
while (configs.hasNext()) {
plasmaParserConfig currentConfig = (plasmaParserConfig) configs.next();
synchronized (currentConfig.enabledParserList) {
currentConfig.enabledParserList.clear();
}
}
// closing the parser object pool
@ -659,7 +532,7 @@ public final class plasmaParser {
//e.printStackTrace();
return null;
} finally {
if ((theParser != null) && (supportedMimeTypesContains(mimeType))) {
if (theParser != null) {
try { plasmaParser.theParserPool.returnObject(mimeType, theParser); } catch (Exception e) { }
}
}
@ -693,8 +566,8 @@ public final class plasmaParser {
// determining the proper parser class name for the mimeType
String parserClassName = null;
ParserInfo parserInfo = null;
synchronized (plasmaParser.enabledParserList) {
if (plasmaParser.enabledParserList.contains(mimeType)) {
synchronized (plasmaParser.availableParserList) {
if (plasmaParser.availableParserList.contains(mimeType)) {
parserInfo = (ParserInfo)plasmaParser.availableParserList.get(mimeType);
parserClassName = parserInfo.parserClassName;
} else {
@ -815,7 +688,7 @@ public final class plasmaParser {
plasmaParser.initRealtimeParsableMimeTypes("application/xhtml+xml,text/html,text/plain");
// configure all other supported mimeTypes
plasmaParser.enableAllParsers();
plasmaParser.enableAllParsers(PARSER_MODE_PROXY);
// parsing the content
plasmaParserDocument document = theParser.parseSource(contentURL, contentMimeType, contentFile);
@ -830,6 +703,81 @@ public final class plasmaParser {
}
}
/**
 * Enables all parsers for the given parser mode.
 * If no configuration exists yet for the mode, a new one is created and
 * registered first, as {@link #initParseableMimeTypes(String, String)} and
 * {@link #setEnabledParserList(String, Set)} do as well.
 *
 * @param parserMode the parser mode whose parsers should be enabled
 * @throws IllegalArgumentException if the parser mode is unknown
 */
private static void enableAllParsers(String parserMode) {
    if (!PARSER_MODE.contains(parserMode)) throw new IllegalArgumentException("unknown parser mode: " + parserMode);

    plasmaParserConfig config = (plasmaParserConfig) parserConfigList.get(parserMode);
    if (config == null) {
        // the mode was not configured yet, avoid a NullPointerException below
        config = new plasmaParserConfig(parserMode);
        parserConfigList.put(parserMode, config);
    }
    config.enableAllParsers();
}
/**
 * Tests whether at least one of the parser configurations supports the
 * content with the given URL and mime type.
 *
 * @param url the URL of the content, must not be <code>null</code>
 * @param mimeType the mime type of the content
 * @return <code>true</code> as soon as one configuration reports the content as supported
 * @throws NullPointerException if the URL is <code>null</code>
 */
public static boolean supportedContent(URL url, String mimeType) {
    if (url == null) throw new NullPointerException();

    for (Iterator cfgIter = parserConfigList.values().iterator(); cfgIter.hasNext();) {
        final plasmaParserConfig cfg = (plasmaParserConfig) cfgIter.next();
        final boolean supported;
        synchronized (cfg.enabledParserList) {
            supported = cfg.supportedContent(url, mimeType);
        }
        if (supported) return true;
    }
    return false;
}
/**
 * Tests whether the configuration of the given parser mode supports the
 * content with the given URL and mime type.
 *
 * @param parserMode the parser mode whose configuration is asked
 * @param url the URL of the content, must not be <code>null</code>
 * @param mimeType the mime type of the content
 * @return <code>false</code> if no configuration exists for the mode, otherwise the answer of the configuration
 * @throws IllegalArgumentException if the parser mode is unknown
 * @throws NullPointerException if the URL is <code>null</code>
 */
public static boolean supportedContent(String parserMode, URL url, String mimeType) {
    if (!PARSER_MODE.contains(parserMode)) throw new IllegalArgumentException();
    if (url == null) throw new NullPointerException();

    final plasmaParserConfig modeConfig = (plasmaParserConfig) parserConfigList.get(parserMode);
    if (modeConfig == null) {
        return false;
    }
    return modeConfig.supportedContent(url, mimeType);
}
/**
 * Initializes the parseable mime types of the given parser mode.
 * A configuration object is created and registered for the mode if none
 * exists yet; the configuration string is then handed over to it.
 *
 * @param parserMode the parser mode to configure
 * @param configStr the configuration string passed on to the configuration of the mode
 * @throws IllegalArgumentException if the parser mode is unknown
 */
public static void initParseableMimeTypes(String parserMode, String configStr) {
    if (!PARSER_MODE.contains(parserMode)) throw new IllegalArgumentException();

    final Object existing = parserConfigList.get(parserMode);
    final plasmaParserConfig modeConfig;
    if (existing != null) {
        modeConfig = (plasmaParserConfig) existing;
    } else {
        modeConfig = new plasmaParserConfig(parserMode);
        parserConfigList.put(parserMode, modeConfig);
    }
    modeConfig.initParseableMimeTypes(configStr);
}
/**
 * Sets the enabled parsers of the given parser mode.
 * A configuration object is created and registered for the mode if none
 * exists yet; the mime type set is then handed over to it.
 *
 * @param parserMode the parser mode to configure
 * @param mimeTypeSet the mime types passed on to the configuration of the mode
 * @return the value returned by the configuration of the mode
 * @throws IllegalArgumentException if the parser mode is unknown
 */
public static String[] setEnabledParserList(String parserMode, Set mimeTypeSet) {
    if (!PARSER_MODE.contains(parserMode)) throw new IllegalArgumentException();

    final Object existing = parserConfigList.get(parserMode);
    final plasmaParserConfig modeConfig;
    if (existing != null) {
        modeConfig = (plasmaParserConfig) existing;
    } else {
        modeConfig = new plasmaParserConfig(parserMode);
        parserConfigList.put(parserMode, modeConfig);
    }
    return modeConfig.setEnabledParserList(mimeTypeSet);
}
/**
 * Tests whether at least one of the parser configurations reports the
 * given file extension as supported.
 *
 * @param fileExt the file extension to check
 * @return <code>true</code> as soon as one configuration contains the extension
 */
public static boolean supportedFileExtContains(String fileExt) {
    for (Iterator cfgIter = parserConfigList.values().iterator(); cfgIter.hasNext();) {
        final plasmaParserConfig cfg = (plasmaParserConfig) cfgIter.next();
        final boolean found;
        synchronized (cfg.enabledParserList) {
            found = cfg.supportedFileExtContains(fileExt);
        }
        if (found) return true;
    }
    return false;
}
/**
 * Tests whether at least one of the parser configurations reports the
 * given mime type as supported.
 *
 * @param mimeType the mime type to check
 * @return <code>true</code> as soon as one configuration contains the mime type
 */
public static boolean supportedMimeTypesContains(String mimeType) {
    for (Iterator cfgIter = parserConfigList.values().iterator(); cfgIter.hasNext();) {
        final plasmaParserConfig cfg = (plasmaParserConfig) cfgIter.next();
        final boolean found;
        synchronized (cfg.enabledParserList) {
            found = cfg.supportedMimeTypesContains(mimeType);
        }
        if (found) return true;
    }
    return false;
}
}
final class plasmaParserFactory implements KeyedPoolableObjectFactory {
@ -920,5 +868,3 @@ final class plasmaParserPool extends GenericKeyedObjectPool {
super.returnObject(key,borrowed);
}
}

@ -196,7 +196,7 @@ public class plasmaParserDocument {
ext = url.substring(extpos).toLowerCase();
}
normal = htmlFilterContentScraper.urlNormalform(null, url);
if (normal != null) {
if (normal != null) { //TODO: extension function is not correct
if (plasmaParser.mediaExtContains(ext.substring(1))) {
// this is not an normal anchor, its a media link
medialinks.put(normal, entry.getValue());

@ -387,7 +387,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// define a realtime parsable mimetype list
log.logConfig("Parser: Initializing Mime Types");
plasmaParser.initRealtimeParsableMimeTypes(getConfig("parseableRealtimeMimeTypes","application/xhtml+xml,text/html,text/plain"));
plasmaParser.initParseableMimeTypes(getConfig("parseableMimeTypes",null));
plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_PROXY,getConfig("parseableMimeTypes.PROXY",null));
plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_CRAWLER,getConfig("parseableMimeTypes.CRAWLER",null));
plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_ICAP,getConfig("parseableMimeTypes.ICAP",null));
plasmaParser.initParseableMimeTypes(plasmaParser.PARSER_MODE_URLREDIRECTOR,getConfig("parseableMimeTypes.URLREDIRECTOR",null));
// start a loader
log.logConfig("Starting Crawl Loader");
@ -721,7 +724,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
wordIndex.close(waitingBoundSeconds);
log.logConfig("SWITCHBOARD SHUTDOWN STEP 3: sending termination signal to database manager");
try {
//sbStackCrawlThread.stopIt();
// closing all still running db importer jobs
plasmaDbImporter.close();
indexDistribution.close();
cacheLoader.close();
wikiDB.close();
@ -1141,7 +1146,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
plasmaParserDocument document = null;
httpHeader entryRespHeader = entry.responseHeader();
String mimeType = (entryRespHeader == null)?null:entryRespHeader.mime();
if (plasmaParser.supportedContent(entry.url(),mimeType)){
if (plasmaParser.supportedContent(
entry.url(),
mimeType)
){
if ((entry.cacheFile().exists()) && (entry.cacheFile().length() > 0)) {
log.logFine("(Parser) '" + entry.normalizedURLString() + "' is not parsed yet, parsing now from File");
document = parser.parseSource(entry.url(), mimeType, entry.cacheFile());

@ -160,7 +160,11 @@ public class urlRedirectord implements serverHandler {
// getting URL mimeType
httpHeader header = httpc.whead(reqURL, 10000, null, null, switchboard.remoteProxyConfig);
if (plasmaParser.supportedContent(reqURL,header.mime())) {
if (plasmaParser.supportedContent(
plasmaParser.PARSER_MODE_URLREDIRECTOR,
reqURL,
header.mime())
) {
// enqueuing URL for crawling
reasonString = switchboard.sbStackCrawlThread.stackCrawl(
this.nextURL,

@ -50,6 +50,14 @@ public final class yacyVersion {
if ((removedSettings == null)||(removedSettings.size() == 0)) return null;
HashMap migratedSettings = new HashMap();
if (removedSettings.containsKey("parseableMimeTypes")) {
String value = (String) removedSettings.get("parseableMimeTypes");
migratedSettings.put("parseableMimeTypes.CRAWLER", value);
migratedSettings.put("parseableMimeTypes.PROXY", value);
migratedSettings.put("parseableMimeTypes.URLREDIRECTOR", value);
migratedSettings.put("parseableMimeTypes.ICAP", value);
}
return migratedSettings;
}

@ -98,7 +98,11 @@ proxyCacheSize = 200
# parseableRealtimeMimeTypes: specifies mime-types that can be indexed on the fly
# parseableMimeTypes.<MODE>: specifies mime-types that can be indexed but not on the fly (one list per mode: CRAWLER, PROXY, ICAP, URLREDIRECTOR)
parseableRealtimeMimeTypes=application/xhtml+xml,text/html,text/plain
parseableMimeTypes=
parseableMimeTypes.CRAWLER=
parseableMimeTypes.PROXY=
parseableMimeTypes.ICAP=
parseableMimeTypes.URLREDIRECTOR=
# media extension string
# a comma-separated list of extensions that denote media file formats

Loading…
Cancel
Save