Fixes for crawling of SMB links (the file length is not always available)

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7190 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 965a40b623
commit 48c0d508ac

@ -31,6 +31,7 @@ import java.util.TreeSet;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.DateFormatter;
import net.yacy.kelondro.util.EventTracker;
import net.yacy.kelondro.util.Formatter;
@ -108,6 +109,7 @@ public class yacysearchitem {
if (!result.url().isLocal()) try {
faviconURL = new DigestURI(result.url().getProtocol() + "://" + result.url().getHost() + ((port != -1) ? (":" + port) : "") + "/favicon.ico", null);
} catch (final MalformedURLException e1) {
Log.logException(e1);
faviconURL = null;
}
@ -124,8 +126,8 @@ public class yacysearchitem {
prop.putJSON("content_title-json", result.title());
prop.putHTML("content_link", result.urlstring());
prop.put("content_display", display);
if (isHtml) sb.loader.loadIfNotExistBackground(faviconURL.toNormalform(true, false), 1024 * 1024 * 10);
prop.putHTML("content_faviconCode", sb.licensedURLs.aquireLicense(faviconURL)); // aquire license for favicon url loading
if (faviconURL != null && isHtml) sb.loader.loadIfNotExistBackground(faviconURL.toNormalform(true, false), 1024 * 1024 * 10);
prop.putHTML("content_faviconCode", sb.licensedURLs.aquireLicense(faviconURL)); // acquire license for favicon url loading
prop.put("content_urlhash", resulthashString);
prop.put("content_urlhexhash", yacySeed.b64Hash2hexHash(resulthashString));
prop.putHTML("content_urlname", nxTools.shortenURLString(result.urlname(), urllength));

@ -103,7 +103,12 @@ public class FileLoader {
// check mime type and availability of parsers
// and also check resource size and limitation of the size
long size = url.length();
long size;
try {
size = url.length();
} catch (Exception e) {
size = -1;
}
String parserError = null;
if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
(size > maxFileSize && maxFileSize >= 0)) {

@ -122,7 +122,12 @@ public class SMBLoader {
// check mime type and availability of parsers
// and also check resource size and limitation of the size
long size = url.length();
long size;
try {
size = url.length();
} catch (Exception e) {
size = -1;
}
String parserError = null;
if ((acceptOnlyParseable && (parserError = TextParser.supports(url, mime)) != null) ||
(size > maxFileSize && maxFileSize >= 0)) {

@ -121,8 +121,14 @@ public class DocumentIndex extends Segment {
if (url.isDirectory()) throw new IOException("file should be a document, not a path");
if (!url.canRead()) throw new IOException("cannot read file");
Document[] documents;
long length;
try {
documents = TextParser.parseSource(url, null, null, url.length(), url.getInputStream(null, -1));
length = url.length();
} catch (Exception e) {
length = -1;
}
try {
documents = TextParser.parseSource(url, null, null, length, url.getInputStream(null, -1));
} catch (Exception e) {
throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage());
}

@ -657,6 +657,10 @@ public class yacySeed implements Cloneable {
/**
 * Tells whether the given peer type string denotes one of the two peer
 * types this method treats as online: {@code PEERTYPE_SENIOR} or
 * {@code PEERTYPE_PRINCIPAL}.
 *
 * @param type the peer type string to test; must not be {@code null}
 * @return {@code true} if {@code type} equals the senior or the principal
 *         peer type constant, {@code false} otherwise
 */
public final boolean isOnline(final String type) {
    // senior is checked first, principal only if senior did not match
    if (type.equals(yacySeed.PEERTYPE_SENIOR)) {
        return true;
    }
    return type.equals(yacySeed.PEERTYPE_PRINCIPAL);
}
/**
 * Draws the next long from the given generator and reduces it to a
 * non-negative value whose magnitude is smaller than that of {@code n}.
 * For a positive {@code n} the result lies in the range [0, n).
 *
 * @param random the generator to draw the raw 64-bit value from
 * @param n      the modulus; must not be zero
 * @return the absolute value of {@code random.nextLong() % n}
 * @throws ArithmeticException if {@code n} is zero
 */
public long nextLong(Random random, long n) {
    // Take the remainder BEFORE the absolute value: Math.abs(Long.MIN_VALUE)
    // is still Long.MIN_VALUE, so abs-then-modulo could yield a negative
    // result. The remainder's magnitude is always below |n|, which makes the
    // subsequent Math.abs safe; for every other input the value is the same
    // as with abs-then-modulo.
    return Math.abs(random.nextLong() % n);
}
private static byte[] bestGap(final yacySeedDB seedDB) {
byte[] randomHash = randomHash();
@ -678,10 +682,8 @@ public class yacySeed implements Cloneable {
// find dht position and size of gap
long left = FlatWordPartitionScheme.std.dhtPosition(interval.substring(0, 12).getBytes(), null);
long right = FlatWordPartitionScheme.std.dhtPosition(interval.substring(12).getBytes(), null);
final long gap4 = FlatWordPartitionScheme.dhtDistance(left, right) >> 2; // a quarter of a gap
long gapx = gap4;
if (random.nextBoolean()) gapx += gap4;
if (random.nextBoolean()) gapx += gap4;
final long gap8 = FlatWordPartitionScheme.dhtDistance(left, right) >> 3; // 1/8 of a gap
long gapx = gap8 + (Math.abs(random.nextLong()) % (6 * gap8));
long gappos = (Long.MAX_VALUE - left >= gapx) ? left + gapx : (left - Long.MAX_VALUE) + gapx;
byte[] computedHash = FlatWordPartitionScheme.positionToHash(gappos);
// the computed hash is the perfect position (modulo gap4 population and gap alternatives)

@ -59,6 +59,7 @@ public class MultiProtocolURI implements Serializable {
private static final Pattern patternBackSlash = Pattern.compile("\\\\");
private static final Pattern patternAmp = Pattern.compile("&");
private static final Pattern patternMail = Pattern.compile("^[a-z]+:.*?");
//private static final Pattern patternSpace = Pattern.compile("%20");
// session id handling
private static final Collator insensitiveCollator = Collator.getInstance(Locale.US);
@ -116,6 +117,7 @@ public class MultiProtocolURI implements Serializable {
// identify protocol
assert (url != null);
url = url.trim();
//url = patternSpace.matcher(url).replaceAll(" ");
if (url.startsWith("\\\\")) {
url = "smb://" + patternBackSlash.matcher(url.substring(2)).replaceAll("/");
}
@ -857,9 +859,9 @@ public class MultiProtocolURI implements Serializable {
if (isSMB()) try {
return getSmbFile().exists();
} catch (SmbException e) {
throw new IOException("SMB.exists SmbException for " + this.toString() + ": " + e.getMessage());
throw new IOException("SMB.exists SmbException (" + e.getMessage() + ") for " + this.toString());
} catch (MalformedURLException e) {
throw new IOException("SMB.exists MalformedURLException for " + this.toString() + ": " + e.getMessage());
throw new IOException("SMB.exists MalformedURLException (" + e.getMessage() + ") for " + this.toString());
}
return false;
}
@ -869,9 +871,9 @@ public class MultiProtocolURI implements Serializable {
if (isSMB()) try {
return getSmbFile().canRead();
} catch (SmbException e) {
throw new IOException("SMB.canRead SmbException for " + this.toString() + ": " + e.getMessage());
throw new IOException("SMB.canRead SmbException (" + e.getMessage() + ") for " + this.toString());
} catch (MalformedURLException e) {
throw new IOException("SMB.canRead MalformedURLException for " + this.toString() + ": " + e.getMessage());
throw new IOException("SMB.canRead MalformedURLException (" + e.getMessage() + ") for " + this.toString());
}
return false;
}
@ -881,9 +883,9 @@ public class MultiProtocolURI implements Serializable {
if (isSMB()) try {
return getSmbFile().canWrite();
} catch (SmbException e) {
throw new IOException("SMB.canWrite SmbException for " + this.toString() + ": " + e.getMessage());
throw new IOException("SMB.canWrite SmbException (" + e.getMessage() + ") for " + this.toString());
} catch (MalformedURLException e) {
throw new IOException("SMB.canWrite MalformedURLException for " + this.toString() + ": " + e.getMessage());
throw new IOException("SMB.canWrite MalformedURLException (" + e.getMessage() + ") for " + this.toString());
}
return false;
}
@ -893,9 +895,9 @@ public class MultiProtocolURI implements Serializable {
if (isSMB()) try {
return getSmbFile().isHidden();
} catch (SmbException e) {
throw new IOException("SMB.isHidden SmbException for " + this.toString() + ": " + e.getMessage());
throw new IOException("SMB.isHidden SmbException (" + e.getMessage() + ") for " + this.toString());
} catch (MalformedURLException e) {
throw new IOException("SMB.isHidden MalformedURLException for " + this.toString() + ": " + e.getMessage());
throw new IOException("SMB.isHidden MalformedURLException (" + e.getMessage() + ") for " + this.toString());
}
return false;
}
@ -905,9 +907,9 @@ public class MultiProtocolURI implements Serializable {
if (isSMB()) try {
return getSmbFile().isDirectory();
} catch (SmbException e) {
throw new IOException("SMB.isDirectory SmbException for " + this.toString() + ": " + e.getMessage());
throw new IOException("SMB.isDirectory SmbException (" + e.getMessage() + ") for " + this.toString());
} catch (MalformedURLException e) {
throw new IOException("SMB.isDirectory MalformedURLException for " + this.toString() + ": " + e.getMessage());
throw new IOException("SMB.isDirectory MalformedURLException (" + e.getMessage() + ") for " + this.toString());
}
return false;
}
@ -917,9 +919,9 @@ public class MultiProtocolURI implements Serializable {
if (isSMB()) try {
return getSmbFile().length();
} catch (SmbException e) {
throw new IOException("SMB.length SmbException for " + this.toString() + ": " + e.getMessage());
throw new IOException("SMB.length SmbException (" + e.getMessage() + ") for " + this.toString());
} catch (MalformedURLException e) {
throw new IOException("SMB.length MalformedURLException for " + this.toString() + ": " + e.getMessage());
throw new IOException("SMB.length MalformedURLException (" + e.getMessage() + ") for " + this.toString());
}
return 0;
}
@ -929,9 +931,9 @@ public class MultiProtocolURI implements Serializable {
if (isSMB()) try {
return getSmbFile().lastModified();
} catch (SmbException e) {
throw new IOException("SMB.lastModified SmbException for " + this.toString() + ": " + e.getMessage());
throw new IOException("SMB.lastModified SmbException (" + e.getMessage() + ") for " + this.toString());
} catch (MalformedURLException e) {
throw new IOException("SMB.lastModified MalformedURLException for " + this.toString() + ": " + e.getMessage());
throw new IOException("SMB.lastModified MalformedURLException (" + e.getMessage() + ") for " + this.toString());
}
return 0;
}
@ -941,7 +943,7 @@ public class MultiProtocolURI implements Serializable {
if (isSMB()) try {
return getSmbFile().getName();
} catch (MalformedURLException e) {
throw new IOException("SMB.getName MalformedURLException for " + this.toString() + ": " + e.getMessage());
throw new IOException("SMB.getName MalformedURLException (" + e.getMessage() + ") for " + this.toString() );
}
return null;
}

@ -85,8 +85,8 @@ public final class FileUtils {
* @see #copy(File source, File dest)
*/
public static long copy(final InputStream source, final OutputStream dest, final long count) throws IOException {
assert count == -1 || count > 0 : "precondition violated: count == " + count + " (nothing to copy)";
if(count == 0) {
assert count < 0 || count > 0 : "precondition violated: count == " + count + " (nothing to copy)";
if (count == 0) {
// no bytes to copy
return 0;
}
@ -101,7 +101,7 @@ public final class FileUtils {
total += c;
if (count > 0) {
chunkSize = (int) Math.min(count-total, DEFAULT_BUFFER_SIZE);
chunkSize = (int) Math.min(count - total, DEFAULT_BUFFER_SIZE);
if (chunkSize == 0) break;
}
@ -275,7 +275,7 @@ public final class FileUtils {
return b;
}
final ByteArrayOutputStream baos = new ByteArrayOutputStream(512);
copy(source, baos, count);
copy(source, baos);
baos.close();
return baos.toByteArray();
}

Loading…
Cancel
Save