Added a classification for control file types, which shall not be loaded

but are instead placed onto the noload queue
pull/1/head
Michael Peter Christen 13 years ago
parent 1517a3b7b9
commit 434ee90c59

@ -356,7 +356,8 @@ public final class CrawlStacker {
entry.url().getContentDomain() == ContentDomain.APP ||
entry.url().getContentDomain() == ContentDomain.IMAGE ||
entry.url().getContentDomain() == ContentDomain.AUDIO ||
entry.url().getContentDomain() == ContentDomain.VIDEO ) {
entry.url().getContentDomain() == ContentDomain.VIDEO ||
entry.url().getContentDomain() == ContentDomain.CTRL) {
warning = this.nextQueue.noticeURL.push(NoticedURL.StackType.NOLOAD, entry);
//if (warning != null) this.log.logWarning("CrawlStacker.stackCrawl of URL " + entry.url().toNormalform(true, false) + " - not pushed: " + warning);
return null;

@ -36,6 +36,7 @@ public class Classification {
private static final Set<String> audioExtSet = new HashSet<String>();
private static final Set<String> videoExtSet = new HashSet<String>();
private static final Set<String> appsExtSet = new HashSet<String>();
private static final Set<String> ctrlExtSet = new HashSet<String>();
public enum ContentDomain {
@ -44,7 +45,8 @@ public class Classification {
IMAGE(1),
AUDIO(2),
VIDEO(3),
APP(4);
APP(4),
CTRL(5);
private final int code;
@ -63,6 +65,7 @@ public class Classification {
else if ("audio".equals(dom)) return AUDIO;
else if ("video".equals(dom)) return VIDEO;
else if ("app".equals(dom)) return APP;
else if ("ctrl".equals(dom)) return CTRL;
return TEXT;
}
@ -74,6 +77,7 @@ public class Classification {
else if (this == AUDIO) return "audio";
else if (this == VIDEO) return "video";
else if (this == APP) return "app";
else if (this == CTRL) return "ctrl";
return "text";
}
}
@ -85,12 +89,14 @@ public class Classification {
final String audio = "aac,aif,aiff,flac,m4a,m4p,mid,mp2,mp3,oga,ogg,ram,sid,wav,wma";
final String video = "3g2,3gp,3gp2,3gpp,3gpp2,3ivx,asf,asx,avi,div,divx,dv,dvx,env,f4v,flv,hdmov,m1v,m4v,m-jpeg,moov,mov,movie,mp2v,mp4,mpe,mpeg,mpg,mpg4,mv4,ogm,ogv,qt,rm,rv,vid,swf,webm,wmv";
final String image = "ai,bmp,cdr,cmx,emf,eps,gif,img,jpeg,jpg,mng,pct,pdd,pdn,pict,png,psb,psd,psp,tif,tiff,wmf";
final String ctrl = "sha1,md5,crc32,sfv";
addSet(textExtSet, text); // text formats
addSet(imageExtSet, image); // image formats
addSet(audioExtSet, audio); // audio formats
addSet(videoExtSet, video); // video formats
addSet(appsExtSet, apps); // application formats
addSet(ctrlExtSet, ctrl); // control formats
addSet(mediaExtSet, apps + "," + audio + "," + video + "," + image); // all media formats
}
@ -129,12 +135,18 @@ public class Classification {
return appsExtSet.contains(appsExt.trim().toLowerCase());
}
/**
 * Checks whether the given file extension is registered as a control file extension.
 * The extension is trimmed and lower-cased before it is looked up in {@code ctrlExtSet}.
 *
 * @param ctrlExt the file extension to test; may be {@code null}
 * @return {@code true} if the normalized extension is contained in {@code ctrlExtSet},
 *         {@code false} if it is not contained or if {@code ctrlExt} is {@code null}
 */
public static boolean isControlExtension(final String ctrlExt) {
    // null can never be a member; short-circuit before normalizing
    return ctrlExt != null && ctrlExtSet.contains(ctrlExt.trim().toLowerCase());
}
/**
 * Maps a file extension to its content domain.
 * The extension checks are probed in a fixed order (text, image, audio, video,
 * application, control) and the first matching check determines the result.
 *
 * @param ext the file extension to classify
 * @return the content domain of the first matching extension check, or
 *         {@code ContentDomain.ALL} if none of the checks matches
 */
public static ContentDomain getContentDomain(final String ext) {
    final ContentDomain domain;
    if (isTextExtension(ext)) {
        domain = ContentDomain.TEXT;
    } else if (isImageExtension(ext)) {
        domain = ContentDomain.IMAGE;
    } else if (isAudioExtension(ext)) {
        domain = ContentDomain.AUDIO;
    } else if (isVideoExtension(ext)) {
        domain = ContentDomain.VIDEO;
    } else if (isApplicationExtension(ext)) {
        domain = ContentDomain.APP;
    } else if (isControlExtension(ext)) {
        domain = ContentDomain.CTRL;
    } else {
        // no known extension class matched
        domain = ContentDomain.ALL;
    }
    return domain;
}

Loading…
Cancel
Save