diff --git a/htroot/CacheAdmin_p.java b/htroot/CacheAdmin_p.java
index 32282cb88..4692aae0b 100644
--- a/htroot/CacheAdmin_p.java
+++ b/htroot/CacheAdmin_p.java
@@ -54,6 +54,7 @@ import java.util.Map;
import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterContentScraper;
+import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.htmlFilter.htmlFilterWriter;
import de.anomic.http.httpHeader;
import de.anomic.net.URL;
@@ -124,7 +125,7 @@ public class CacheAdmin_p {
info.append("TITLE:
").append(scraper.getTitle()).append("
").append("
")
.append("SECTION HEADLINES:
").append(formatTitles(document.getSectionTitles())).append("
")
.append("HREF:
").append(formatAnchor(document.getHyperlinks())).append("
")
- .append("IMAGE:
").append(formatAnchor(document.getImagelinks())).append("
")
+ .append("IMAGE:
").append(formatImageAnchor(document.getImages())).append("
")
.append("AUDIO:
").append(formatAnchor(document.getAudiolinks())).append("
")
.append("VIDEO:
").append(formatAnchor(document.getVideolinks())).append("
")
.append("APPS:
").append(formatAnchor(document.getApplinks())).append("
")
@@ -241,6 +242,18 @@ public class CacheAdmin_p {
return result.append("").toString();
}
+ private static String formatImageAnchor(TreeSet anchor) {
+ final StringBuffer result = new StringBuffer((anchor.size() + 1) * 256);
+ result.append("
").append(ie.alt()).append(" | ").append(ie.url().toNormalform()).append(" |
null
if unknown
*/
- public String getSourceCharset() {
+ public String getCharset() {
return this.charset;
}
@@ -224,12 +223,6 @@ public class plasmaParserDocument {
return anchors;
}
- public TreeSet getImages() {
- // returns all links enbedded as pictures (visible in document)
- // this resturns a htmlFilterImageEntry collection
- if (!resorted) resortLinks();
- return images;
- }
// the next three methods provide a calculated view on the getAnchors/getImages:
@@ -249,9 +242,11 @@ public class plasmaParserDocument {
return this.videolinks;
}
- public Map getImagelinks() {
+ public TreeSet getImages() {
+ // returns all links embedded as pictures (visible in document)
+ // this returns a collection of htmlFilterImageEntry objects
if (!resorted) resortLinks();
- return this.imagelinks;
+ return images;
}
public Map getApplinks() {
@@ -275,7 +270,6 @@ public class plasmaParserDocument {
String ext = null;
i = anchors.entrySet().iterator();
hyperlinks = new HashMap();
- imagelinks = new HashMap();
videolinks = new HashMap();
audiolinks = new HashMap();
applinks = new HashMap();
@@ -301,8 +295,7 @@ public class plasmaParserDocument {
if (plasmaParser.mediaExtContains(ext)) {
// this is not a normal anchor, its a media link
if (plasmaParser.imageExtContains(ext)) {
- imagelinks.put(u, entry.getValue());
- collectedImages.add(new htmlFilterImageEntry(url, "", -1, -1));
+ collectedImages.add(new htmlFilterImageEntry(url, (String) entry.getValue(), -1, -1));
}
else if (plasmaParser.audioExtContains(ext)) audiolinks.put(u, entry.getValue());
else if (plasmaParser.videoExtContains(ext)) videolinks.put(u, entry.getValue());
@@ -316,21 +309,7 @@ public class plasmaParserDocument {
}
}
- // expand the hyperlinks:
- // we add artificial hyperlinks to the hyperlink set
- // that can be calculated from given hyperlinks and imagelinks
- hyperlinks.putAll(plasmaParser.allReflinks(hyperlinks));
- hyperlinks.putAll(plasmaParser.allReflinks(imagelinks));
- hyperlinks.putAll(plasmaParser.allReflinks(audiolinks));
- hyperlinks.putAll(plasmaParser.allReflinks(videolinks));
- hyperlinks.putAll(plasmaParser.allReflinks(applinks));
- hyperlinks.putAll(plasmaParser.allSubpaths(hyperlinks));
- hyperlinks.putAll(plasmaParser.allSubpaths(imagelinks));
- hyperlinks.putAll(plasmaParser.allSubpaths(audiolinks));
- hyperlinks.putAll(plasmaParser.allSubpaths(videolinks));
- hyperlinks.putAll(plasmaParser.allSubpaths(applinks));
-
- // finally add image links that we collected from the anchors to the image map
+ // add image links that we collected from the anchors to the image map
i = collectedImages.iterator();
htmlFilterImageEntry iEntry;
while (i.hasNext()) {
@@ -338,6 +317,20 @@ public class plasmaParserDocument {
if (!images.contains(iEntry)) images.add(iEntry);
}
+ // expand the hyperlinks:
+ // we add artificial hyperlinks to the hyperlink set
+ // that can be calculated from given hyperlinks and imagelinks
+ hyperlinks.putAll(plasmaParser.allReflinks(hyperlinks.keySet()));
+ hyperlinks.putAll(plasmaParser.allReflinks(images));
+ hyperlinks.putAll(plasmaParser.allReflinks(audiolinks.keySet()));
+ hyperlinks.putAll(plasmaParser.allReflinks(videolinks.keySet()));
+ hyperlinks.putAll(plasmaParser.allReflinks(applinks.keySet()));
+ hyperlinks.putAll(plasmaParser.allSubpaths(hyperlinks.keySet()));
+ hyperlinks.putAll(plasmaParser.allSubpaths(images));
+ hyperlinks.putAll(plasmaParser.allSubpaths(audiolinks.keySet()));
+ hyperlinks.putAll(plasmaParser.allSubpaths(videolinks.keySet()));
+ hyperlinks.putAll(plasmaParser.allSubpaths(applinks.keySet()));
+
// don't do this again
this.resorted = true;
}
diff --git a/source/de/anomic/plasma/plasmaSearchEvent.java b/source/de/anomic/plasma/plasmaSearchEvent.java
index 1d0c45ec3..a1669603a 100644
--- a/source/de/anomic/plasma/plasmaSearchEvent.java
+++ b/source/de/anomic/plasma/plasmaSearchEvent.java
@@ -403,6 +403,11 @@ public final class plasmaSearchEvent extends Thread implements Runnable {
// filter out bad results
Iterator wi = query.queryHashes.iterator();
while (wi.hasNext()) wordIndex.removeEntry((String) wi.next(), page.hash(), true);
+ } else if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) {
+ if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (page.laudio() > 0)) acc.addResult(page, preranking);
+ else if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (page.lvideo() > 0)) acc.addResult(page, preranking);
+ else if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (page.limage() > 0)) acc.addResult(page, preranking);
+ else if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP) && (page.lapp() > 0)) acc.addResult(page, preranking);
} else {
acc.addResult(page, preranking);
}
diff --git a/source/de/anomic/plasma/plasmaSearchPreOrder.java b/source/de/anomic/plasma/plasmaSearchPreOrder.java
index 20d63db54..fefb3acb7 100644
--- a/source/de/anomic/plasma/plasmaSearchPreOrder.java
+++ b/source/de/anomic/plasma/plasmaSearchPreOrder.java
@@ -101,6 +101,12 @@ public final class plasmaSearchPreOrder {
iEntry = (indexRWIEntry) i.next();
if (iEntry.urlHash().length() != container.row().width(container.primarykey())) continue;
if ((!(query.constraint.equals(plasmaSearchQuery.catchall_constraint))) && (!(iEntry.flags().allOf(query.constraint)))) continue; // filter out entries that do not match the search constraint
+ if (query.contentdom != plasmaSearchQuery.CONTENTDOM_TEXT) {
+ if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_AUDIO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasaudio)))) continue;
+ if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_VIDEO) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasvideo)))) continue;
+ if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_IMAGE) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasimage)))) continue;
+ if ((query.contentdom == plasmaSearchQuery.CONTENTDOM_APP ) && (!(iEntry.flags().get(plasmaCondenser.flag_cat_hasapp )))) continue;
+ }
pageAcc.put(serverCodings.encodeHex(Long.MAX_VALUE - this.ranking.preRanking(iEntry.generateNormalized(this.entryMin, this.entryMax), query.words("")), 16) + iEntry.urlHash(), iEntry);
}
}
diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java
index f318780a7..847d3f1b6 100644
--- a/source/de/anomic/plasma/plasmaSearchQuery.java
+++ b/source/de/anomic/plasma/plasmaSearchQuery.java
@@ -61,12 +61,19 @@ public final class plasmaSearchQuery {
public static final int SEARCHDOM_GLOBALDHT = 3;
public static final int SEARCHDOM_GLOBALALL = 4;
+ public static final int CONTENTDOM_TEXT = 0;
+ public static final int CONTENTDOM_IMAGE = 1;
+ public static final int CONTENTDOM_AUDIO = 2;
+ public static final int CONTENTDOM_VIDEO = 3;
+ public static final int CONTENTDOM_APP = 4;
+
public static final kelondroBitfield empty_constraint = new kelondroBitfield(4, "AAAAAA");
public static final kelondroBitfield catchall_constraint = new kelondroBitfield(4, "______");
public Set queryWords, queryHashes;
public int wantedResults;
public String prefer;
+ public int contentdom;
public long maximumTime;
public String urlMask;
public int domType;
@@ -75,13 +82,14 @@ public final class plasmaSearchQuery {
public int maxDistance;
public kelondroBitfield constraint;
- public plasmaSearchQuery(Set queryWords, int maxDistance, String prefer,
+ public plasmaSearchQuery(Set queryWords, int maxDistance, String prefer, int contentdom,
int wantedResults, long maximumTime, String urlMask,
int domType, String domGroupName, int domMaxTargets,
kelondroBitfield constraint) {
this.queryWords = queryWords;
this.maxDistance = maxDistance;
this.prefer = prefer;
+ this.contentdom = contentdom;
this.queryHashes = plasmaCondenser.words2hashes(queryWords);
this.wantedResults = wantedResults;
this.maximumTime = maximumTime;
@@ -92,12 +100,13 @@ public final class plasmaSearchQuery {
this.constraint = constraint;
}
- public plasmaSearchQuery(Set queryHashes, int maxDistance, String prefer,
+ public plasmaSearchQuery(Set queryHashes, int maxDistance, String prefer, int contentdom,
int wantedResults, long maximumTime, String urlMask,
kelondroBitfield constraint) {
this.queryWords = null;
this.maxDistance = maxDistance;
this.prefer = prefer;
+ this.contentdom = contentdom;
this.queryHashes = queryHashes;
this.wantedResults = wantedResults;
this.maximumTime = maximumTime;
@@ -108,6 +117,15 @@ public final class plasmaSearchQuery {
this.constraint = constraint;
}
+    public static int contentdomParser(String dom) { // maps a content-domain name to its CONTENTDOM_* code
+        // literal-first equals(): a null dom does not throw but falls through to the default below
+        if ("image".equals(dom)) return CONTENTDOM_IMAGE;
+        if ("audio".equals(dom)) return CONTENTDOM_AUDIO;
+        if ("video".equals(dom)) return CONTENTDOM_VIDEO;
+        if ("app".equals(dom)) return CONTENTDOM_APP;
+        return CONTENTDOM_TEXT; // "text", null and every unrecognized name select the text domain
+    }
+
public static Set hashes2Set(String query) {
if (query == null) return new HashSet();
final HashSet keyhashes = new HashSet(query.length() / yacySeedDB.commonHashLength);
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index b4291f3f1..d4a62654d 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -1565,7 +1565,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
checkInterruption();
log.logFine("Condensing for '" + entry.normalizedURLString() + "'");
- plasmaCondenser condenser = new plasmaCondenser(document.getText(), document.charset);
+ plasmaCondenser condenser = new plasmaCondenser(document);
// generate citation reference
Integer[] ioLinks = generateCitationReference(entry.urlHash(), docDate, document, condenser); // [outlinksSame, outlinksOther]
@@ -1593,10 +1593,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
plasmaURL.language(entry.url()), // language
ioLinks[0].intValue(), // llocal
ioLinks[1].intValue(), // lother
- document.audiolinks.size(), // laudio
- document.imagelinks.size(), // limage
- document.videolinks.size(), // lvideo
- document.applinks.size() // lapp
+ document.getAudiolinks().size(), // laudio
+ document.getImages().size(), // limage
+ document.getVideolinks().size(), // lvideo
+ document.getApplinks().size() // lapp
);
/* ========================================================================
* STORE URL TO LOADED-URL-DB
@@ -1751,9 +1751,9 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
log.logInfo("*Indexed " + words + " words in URL " + entry.url() +
" [" + entry.urlHash() + "]" +
"\n\tDescription: " + docDescription +
- "\n\tMimeType: " + document.getMimeType() + " | Charset: " + document.getSourceCharset() + " | " +
+ "\n\tMimeType: " + document.getMimeType() + " | Charset: " + document.getCharset() + " | " +
"Size: " + document.getTextLength() + " bytes | " +
- "Anchors: " + ((document.anchors==null)?0:document.anchors.size()) +
+ "Anchors: " + ((document.getAnchors() == null) ? 0 : document.getAnchors().size()) +
"\n\tStackingTime: " + (stackEndTime-stackStartTime) + " ms | " +
"ParsingTime: " + (parsingEndTime-parsingStartTime) + " ms | " +
"IndexingTime: " + (indexingEndTime-indexingStartTime) + " ms | " +
@@ -2239,13 +2239,10 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// parse the resource
plasmaParserDocument document = snippetCache.parseDocument(comp.url(), resourceContentLength.longValue(), resourceContent);
- // getting parsed body input stream
- InputStream docBodyInputStream = document.getText();
-
// getting word iterator
Iterator witer = null;
try {
- witer = plasmaCondenser.getWords(docBodyInputStream, document.charset);
+ witer = new plasmaCondenser(document).words();
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
diff --git a/source/de/anomic/tools/nxTools.java b/source/de/anomic/tools/nxTools.java
index 604f36a7d..7fae537ab 100644
--- a/source/de/anomic/tools/nxTools.java
+++ b/source/de/anomic/tools/nxTools.java
@@ -165,7 +165,7 @@ public class nxTools {
e = s;
while (e < a.length) {
b = a[e];
- if ((b == 10) || (b == 13)) break;
+ if ((b == 10) || (b == 13) || (b == 0)) break;
e++;
}