fixes for crawling of smb links (file length not always available)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7190 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 965a40b623
commit 48c0d508ac

@ -31,6 +31,7 @@ import java.util.TreeSet;
import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter; import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.EventTracker; import net.yacy.kelondro.util.EventTracker;
import net.yacy.kelondro.util.Formatter; import net.yacy.kelondro.util.Formatter;
@ -108,6 +109,7 @@ public class yacysearchitem {
if (!result.url().isLocal()) try { if (!result.url().isLocal()) try {
faviconURL = new DigestURI(result.url().getProtocol() + "://" + result.url().getHost() + ((port != -1) ? (":" + port) : "") + "/favicon.ico", null); faviconURL = new DigestURI(result.url().getProtocol() + "://" + result.url().getHost() + ((port != -1) ? (":" + port) : "") + "/favicon.ico", null);
} catch (final MalformedURLException e1) { } catch (final MalformedURLException e1) {
Log.logException(e1);
faviconURL = null; faviconURL = null;
} }
@ -124,8 +126,8 @@ public class yacysearchitem {
prop.putJSON("content_title-json", result.title()); prop.putJSON("content_title-json", result.title());
prop.putHTML("content_link", result.urlstring()); prop.putHTML("content_link", result.urlstring());
prop.put("content_display", display); prop.put("content_display", display);
if (isHtml) sb.loader.loadIfNotExistBackground(faviconURL.toNormalform(true, false), 1024 * 1024 * 10); if (faviconURL != null && isHtml) sb.loader.loadIfNotExistBackground(faviconURL.toNormalform(true, false), 1024 * 1024 * 10);
prop.putHTML("content_faviconCode", sb.licensedURLs.aquireLicense(faviconURL)); // aquire license for favicon url loading prop.putHTML("content_faviconCode", sb.licensedURLs.aquireLicense(faviconURL)); // acquire license for favicon url loading
prop.put("content_urlhash", resulthashString); prop.put("content_urlhash", resulthashString);
prop.put("content_urlhexhash", yacySeed.b64Hash2hexHash(resulthashString)); prop.put("content_urlhexhash", yacySeed.b64Hash2hexHash(resulthashString));
prop.putHTML("content_urlname", nxTools.shortenURLString(result.urlname(), urllength)); prop.putHTML("content_urlname", nxTools.shortenURLString(result.urlname(), urllength));

@ -103,7 +103,12 @@ public class FileLoader {
// check mime type and availability of parsers // check mime type and availability of parsers
// and also check resource size and limitation of the size // and also check resource size and limitation of the size
long size = url.length(); long size;
try {
size = url.length();
} catch (Exception e) {
size = -1;
}
String parserError = null; String parserError = null;
if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) || if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
(size > maxFileSize && maxFileSize >= 0)) { (size > maxFileSize && maxFileSize >= 0)) {

@ -122,7 +122,12 @@ public class SMBLoader {
// check mime type and availability of parsers // check mime type and availability of parsers
// and also check resource size and limitation of the size // and also check resource size and limitation of the size
long size = url.length(); long size;
try {
size = url.length();
} catch (Exception e) {
size = -1;
}
String parserError = null; String parserError = null;
if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) || if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
(size > maxFileSize && maxFileSize >= 0)) { (size > maxFileSize && maxFileSize >= 0)) {

@ -121,8 +121,14 @@ public class DocumentIndex extends Segment {
if (url.isDirectory()) throw new IOException("file should be a document, not a path"); if (url.isDirectory()) throw new IOException("file should be a document, not a path");
if (!url.canRead()) throw new IOException("cannot read file"); if (!url.canRead()) throw new IOException("cannot read file");
Document[] documents; Document[] documents;
long length;
try { try {
documents = TextParser.parseSource(url, null, null, url.length(), url.getInputStream(null, -1)); length = url.length();
} catch (Exception e) {
length = -1;
}
try {
documents = TextParser.parseSource(url, null, null, length, url.getInputStream(null, -1));
} catch (Exception e) { } catch (Exception e) {
throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage()); throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage());
} }

@ -658,6 +658,10 @@ public class yacySeed implements Cloneable {
return type.equals(yacySeed.PEERTYPE_SENIOR) || type.equals(yacySeed.PEERTYPE_PRINCIPAL); return type.equals(yacySeed.PEERTYPE_SENIOR) || type.equals(yacySeed.PEERTYPE_PRINCIPAL);
} }
/**
 * Draws a pseudo-random, non-negative long that is smaller than the given bound.
 * <p>
 * The result is the absolute value of one {@code random.nextLong()} draw, reduced
 * modulo {@code n}. The distribution is therefore only approximately uniform when
 * {@code n} does not divide 2^63 evenly.
 *
 * @param random the source of randomness; exactly one {@code nextLong()} is consumed
 * @param n the exclusive upper bound; expected to be positive
 * @return a value in {@code [0, n)} for positive {@code n}
 * @throws ArithmeticException if {@code n} is zero (integer remainder by zero)
 */
public long nextLong(Random random, long n) {
    long value = random.nextLong();
    // Math.abs(Long.MIN_VALUE) overflows and stays Long.MIN_VALUE, which would make
    // the remainder negative. Map that single value to 0 so the result is never negative;
    // every other draw yields exactly the same result as before.
    if (value == Long.MIN_VALUE) {
        value = 0L;
    }
    return Math.abs(value) % n;
}
private static byte[] bestGap(final yacySeedDB seedDB) { private static byte[] bestGap(final yacySeedDB seedDB) {
byte[] randomHash = randomHash(); byte[] randomHash = randomHash();
if ((seedDB == null) || (seedDB.sizeConnected() <= 2)) { if ((seedDB == null) || (seedDB.sizeConnected() <= 2)) {
@ -678,10 +682,8 @@ public class yacySeed implements Cloneable {
// find dht position and size of gap // find dht position and size of gap
long left = FlatWordPartitionScheme.std.dhtPosition(interval.substring(0, 12).getBytes(), null); long left = FlatWordPartitionScheme.std.dhtPosition(interval.substring(0, 12).getBytes(), null);
long right = FlatWordPartitionScheme.std.dhtPosition(interval.substring(12).getBytes(), null); long right = FlatWordPartitionScheme.std.dhtPosition(interval.substring(12).getBytes(), null);
final long gap4 = FlatWordPartitionScheme.dhtDistance(left, right) >> 2; // a quarter of a gap final long gap8 = FlatWordPartitionScheme.dhtDistance(left, right) >> 3; // 1/8 of a gap
long gapx = gap4; long gapx = gap8 + (Math.abs(random.nextLong()) % (6 * gap8));
if (random.nextBoolean()) gapx += gap4;
if (random.nextBoolean()) gapx += gap4;
long gappos = (Long.MAX_VALUE - left >= gapx) ? left + gapx : (left - Long.MAX_VALUE) + gapx; long gappos = (Long.MAX_VALUE - left >= gapx) ? left + gapx : (left - Long.MAX_VALUE) + gapx;
byte[] computedHash = FlatWordPartitionScheme.positionToHash(gappos); byte[] computedHash = FlatWordPartitionScheme.positionToHash(gappos);
// the computed hash is the perfect position (modulo gap4 population and gap alternatives) // the computed hash is the perfect position (modulo gap4 population and gap alternatives)

@ -59,6 +59,7 @@ public class MultiProtocolURI implements Serializable {
private static final Pattern patternBackSlash = Pattern.compile("\\\\"); private static final Pattern patternBackSlash = Pattern.compile("\\\\");
private static final Pattern patternAmp = Pattern.compile("&"); private static final Pattern patternAmp = Pattern.compile("&");
private static final Pattern patternMail = Pattern.compile("^[a-z]+:.*?"); private static final Pattern patternMail = Pattern.compile("^[a-z]+:.*?");
//private static final Pattern patternSpace = Pattern.compile("%20");
// session id handling // session id handling
private static final Collator insensitiveCollator = Collator.getInstance(Locale.US); private static final Collator insensitiveCollator = Collator.getInstance(Locale.US);
@ -116,6 +117,7 @@ public class MultiProtocolURI implements Serializable {
// identify protocol // identify protocol
assert (url != null); assert (url != null);
url = url.trim(); url = url.trim();
//url = patternSpace.matcher(url).replaceAll(" ");
if (url.startsWith("\\\\")) { if (url.startsWith("\\\\")) {
url = "smb://" + patternBackSlash.matcher(url.substring(2)).replaceAll("/"); url = "smb://" + patternBackSlash.matcher(url.substring(2)).replaceAll("/");
} }
@ -857,9 +859,9 @@ public class MultiProtocolURI implements Serializable {
if (isSMB()) try { if (isSMB()) try {
return getSmbFile().exists(); return getSmbFile().exists();
} catch (SmbException e) { } catch (SmbException e) {
throw new IOException("SMB.exists SmbException for " + this.toString() + ": " + e.getMessage()); throw new IOException("SMB.exists SmbException (" + e.getMessage() + ") for " + this.toString());
} catch (MalformedURLException e) { } catch (MalformedURLException e) {
throw new IOException("SMB.exists MalformedURLException for " + this.toString() + ": " + e.getMessage()); throw new IOException("SMB.exists MalformedURLException (" + e.getMessage() + ") for " + this.toString());
} }
return false; return false;
} }
@ -869,9 +871,9 @@ public class MultiProtocolURI implements Serializable {
if (isSMB()) try { if (isSMB()) try {
return getSmbFile().canRead(); return getSmbFile().canRead();
} catch (SmbException e) { } catch (SmbException e) {
throw new IOException("SMB.canRead SmbException for " + this.toString() + ": " + e.getMessage()); throw new IOException("SMB.canRead SmbException (" + e.getMessage() + ") for " + this.toString());
} catch (MalformedURLException e) { } catch (MalformedURLException e) {
throw new IOException("SMB.canRead MalformedURLException for " + this.toString() + ": " + e.getMessage()); throw new IOException("SMB.canRead MalformedURLException (" + e.getMessage() + ") for " + this.toString());
} }
return false; return false;
} }
@ -881,9 +883,9 @@ public class MultiProtocolURI implements Serializable {
if (isSMB()) try { if (isSMB()) try {
return getSmbFile().canWrite(); return getSmbFile().canWrite();
} catch (SmbException e) { } catch (SmbException e) {
throw new IOException("SMB.canWrite SmbException for " + this.toString() + ": " + e.getMessage()); throw new IOException("SMB.canWrite SmbException (" + e.getMessage() + ") for " + this.toString());
} catch (MalformedURLException e) { } catch (MalformedURLException e) {
throw new IOException("SMB.canWrite MalformedURLException for " + this.toString() + ": " + e.getMessage()); throw new IOException("SMB.canWrite MalformedURLException (" + e.getMessage() + ") for " + this.toString());
} }
return false; return false;
} }
@ -893,9 +895,9 @@ public class MultiProtocolURI implements Serializable {
if (isSMB()) try { if (isSMB()) try {
return getSmbFile().isHidden(); return getSmbFile().isHidden();
} catch (SmbException e) { } catch (SmbException e) {
throw new IOException("SMB.isHidden SmbException for " + this.toString() + ": " + e.getMessage()); throw new IOException("SMB.isHidden SmbException (" + e.getMessage() + ") for " + this.toString());
} catch (MalformedURLException e) { } catch (MalformedURLException e) {
throw new IOException("SMB.isHidden MalformedURLException for " + this.toString() + ": " + e.getMessage()); throw new IOException("SMB.isHidden MalformedURLException (" + e.getMessage() + ") for " + this.toString());
} }
return false; return false;
} }
@ -905,9 +907,9 @@ public class MultiProtocolURI implements Serializable {
if (isSMB()) try { if (isSMB()) try {
return getSmbFile().isDirectory(); return getSmbFile().isDirectory();
} catch (SmbException e) { } catch (SmbException e) {
throw new IOException("SMB.isDirectory SmbException for " + this.toString() + ": " + e.getMessage()); throw new IOException("SMB.isDirectory SmbException (" + e.getMessage() + ") for " + this.toString());
} catch (MalformedURLException e) { } catch (MalformedURLException e) {
throw new IOException("SMB.isDirectory MalformedURLException for " + this.toString() + ": " + e.getMessage()); throw new IOException("SMB.isDirectory MalformedURLException (" + e.getMessage() + ") for " + this.toString());
} }
return false; return false;
} }
@ -917,9 +919,9 @@ public class MultiProtocolURI implements Serializable {
if (isSMB()) try { if (isSMB()) try {
return getSmbFile().length(); return getSmbFile().length();
} catch (SmbException e) { } catch (SmbException e) {
throw new IOException("SMB.length SmbException for " + this.toString() + ": " + e.getMessage()); throw new IOException("SMB.length SmbException (" + e.getMessage() + ") for " + this.toString());
} catch (MalformedURLException e) { } catch (MalformedURLException e) {
throw new IOException("SMB.length MalformedURLException for " + this.toString() + ": " + e.getMessage()); throw new IOException("SMB.length MalformedURLException (" + e.getMessage() + ") for " + this.toString());
} }
return 0; return 0;
} }
@ -929,9 +931,9 @@ public class MultiProtocolURI implements Serializable {
if (isSMB()) try { if (isSMB()) try {
return getSmbFile().lastModified(); return getSmbFile().lastModified();
} catch (SmbException e) { } catch (SmbException e) {
throw new IOException("SMB.lastModified SmbException for " + this.toString() + ": " + e.getMessage()); throw new IOException("SMB.lastModified SmbException (" + e.getMessage() + ") for " + this.toString());
} catch (MalformedURLException e) { } catch (MalformedURLException e) {
throw new IOException("SMB.lastModified MalformedURLException for " + this.toString() + ": " + e.getMessage()); throw new IOException("SMB.lastModified MalformedURLException (" + e.getMessage() + ") for " + this.toString());
} }
return 0; return 0;
} }
@ -941,7 +943,7 @@ public class MultiProtocolURI implements Serializable {
if (isSMB()) try { if (isSMB()) try {
return getSmbFile().getName(); return getSmbFile().getName();
} catch (MalformedURLException e) { } catch (MalformedURLException e) {
throw new IOException("SMB.getName MalformedURLException for " + this.toString() + ": " + e.getMessage()); throw new IOException("SMB.getName MalformedURLException (" + e.getMessage() + ") for " + this.toString() );
} }
return null; return null;
} }

@ -85,8 +85,8 @@ public final class FileUtils {
* @see #copy(File source, File dest) * @see #copy(File source, File dest)
*/ */
public static long copy(final InputStream source, final OutputStream dest, final long count) throws IOException { public static long copy(final InputStream source, final OutputStream dest, final long count) throws IOException {
assert count == -1 || count > 0 : "precondition violated: count == " + count + " (nothing to copy)"; assert count < 0 || count > 0 : "precondition violated: count == " + count + " (nothing to copy)";
if(count == 0) { if (count == 0) {
// no bytes to copy // no bytes to copy
return 0; return 0;
} }
@ -101,7 +101,7 @@ public final class FileUtils {
total += c; total += c;
if (count > 0) { if (count > 0) {
chunkSize = (int) Math.min(count-total, DEFAULT_BUFFER_SIZE); chunkSize = (int) Math.min(count - total, DEFAULT_BUFFER_SIZE);
if (chunkSize == 0) break; if (chunkSize == 0) break;
} }
@ -275,7 +275,7 @@ public final class FileUtils {
return b; return b;
} }
final ByteArrayOutputStream baos = new ByteArrayOutputStream(512); final ByteArrayOutputStream baos = new ByteArrayOutputStream(512);
copy(source, baos, count); copy(source, baos);
baos.close(); baos.close();
return baos.toByteArray(); return baos.toByteArray();
} }

Loading…
Cancel
Save