- added parsing of the robots meta tag in HTML headers to detect a noindex directive

- added evaluation of the noindex directive: indexing is prevented when an HTML file carries it

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6709 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 15 years ago
parent f336ed568d
commit 54af9e6b49

@ -1677,8 +1677,15 @@ public final class Switchboard extends serverSwitch {
public indexingQueueEntry condenseDocument(final indexingQueueEntry in) {
in.queueEntry.updateStatus(Response.QUEUE_STATE_CONDENSING);
// debug
if (log.isFinest()) log.logFinest("CONDENSE "+ in.queueEntry.toString());
if (in.document.indexingDenied()) {
if (log.isInfo()) log.logInfo("Not Condensed Resource '" + in.queueEntry.url().toNormalform(false, true) + "': denied by document-attached noindexing rule");
return new indexingQueueEntry(in.process, in.queueEntry, in.document, null);
}
if (!in.queueEntry.profile().indexText() && !in.queueEntry.profile().indexMedia()) {
if (log.isInfo()) log.logInfo("Not Condensed Resource '" + in.queueEntry.url().toNormalform(false, true) + "': indexing not wanted by crawl profile");
return new indexingQueueEntry(in.process, in.queueEntry, in.document, null);
}
// strip out words and generate statistics
if (this.log.isFine()) log.logFine("Condensing for '" + in.queueEntry.url().toNormalform(false, true) + "'");
@ -1719,10 +1726,22 @@ public final class Switchboard extends serverSwitch {
EventOrigin processCase = queueEntry.processCase(peers.mySeed().hash);
if (process == Segments.Process.SURROGATES) processCase = EventOrigin.SURROGATES;
if (condenser == null || document.indexingDenied()) {
if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by rule in document, process case=" + processCase);
addURLtoErrorDB(queueEntry.url(), referrerURL.hash(), queueEntry.initiator(), dc_title, "unknown indexing process case" + processCase);
return;
}
if (!queueEntry.profile().indexText() && !queueEntry.profile().indexMedia()) {
if (this.log.isInfo()) log.logInfo("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': denied by profile rule, process case=" + processCase);
addURLtoErrorDB(queueEntry.url(), referrerURL.hash(), queueEntry.initiator(), dc_title, "unknown indexing process case" + processCase);
return;
}
// remove stopwords
log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + queueEntry.url());
// STORE URL TO LOADED-URL-DB
// STORE WORD INDEX
URIMetadataRow newEntry = null;
try {
newEntry = indexSegments.segment(process).storeDocument(
@ -1747,13 +1766,6 @@ public final class Switchboard extends serverSwitch {
processCase // process case
);
// STORE WORD INDEX
if ((!queueEntry.profile().indexText()) && (!queueEntry.profile().indexMedia())) {
if (this.log.isFine()) log.logFine("Not Indexed Resource '" + queueEntry.url().toNormalform(false, true) + "': process case=" + processCase);
addURLtoErrorDB(queueEntry.url(), referrerURL.hash(), queueEntry.initiator(), dc_title, "unknown indexing process case" + processCase);
return;
}
// increment number of indexed urls
indexedPages++;

@ -131,8 +131,8 @@ public class WebStructureGraph {
Base64Order.enhancedCoder.encodeLongSmart(document.getImages().size(), 2) + // count of Images in document
Base64Order.enhancedCoder.encodeLongSmart(0, 2) + // count of links to other documents
Base64Order.enhancedCoder.encodeLongSmart(document.getTextLength(), 3) + // length of plain text in bytes
Base64Order.enhancedCoder.encodeLongSmart(condenser.RESULT_NUMB_WORDS, 3) + // count of all appearing words
Base64Order.enhancedCoder.encodeLongSmart(condenser.words().size(), 3) + // count of all unique words
Base64Order.enhancedCoder.encodeLongSmart((condenser == null) ? 0 : condenser.RESULT_NUMB_WORDS, 3) + // count of all appearing words
Base64Order.enhancedCoder.encodeLongSmart((condenser == null) ? 0 : condenser.words().size(), 3) + // count of all unique words
Base64Order.enhancedCoder.encodeLongSmart(0, 1); // Flags (update, popularity, attention, vote)
//crl.append(head); crl.append ('|'); crl.append(cpl); crl.append((char) 13); crl.append((char) 10);

@ -240,4 +240,5 @@ public abstract class AbstractParser implements Idiom {
/**
 * Computes the hash code of this parser from its name.
 *
 * @return the hash code of the string returned by {@code getName()}
 */
public int hashCode() {
return this.getName().hashCode();
}
}

@ -76,11 +76,13 @@ public class Document {
private InputStream textStream;
private int inboundLinks, outboundLinks; // counters for inbound and outbound links, are counted after calling notifyWebStructure
private Set<String> languages;
private boolean indexingDenied;
protected Document(final DigestURI location, final String mimeType, final String charset, final Set<String> languages,
public Document(final DigestURI location, final String mimeType, final String charset, final Set<String> languages,
final String[] keywords, final String title, final String author,
final String[] sections, final String abstrct,
final Object text, final Map<DigestURI, String> anchors, final HashMap<String, ImageEntry> images) {
final Object text, final Map<DigestURI, String> anchors, final HashMap<String, ImageEntry> images,
boolean indexingDenied) {
this.source = location;
this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
this.charset = charset;
@ -100,6 +102,7 @@ public class Document {
this.inboundLinks = -1;
this.outboundLinks = -1;
this.languages = languages;
this.indexingDenied = indexingDenied;
if (text == null) try {
this.text = new CachedFileOutputStream(Idiom.MAX_KEEP_IN_MEMORY_SIZE);
@ -111,31 +114,6 @@ public class Document {
}
}
public Document(final DigestURI location, final String mimeType, final String charset, final Set<String> languages) {
this(location, mimeType, charset, languages, null, null, null, null, null, (Object)null, null, null);
}
public Document(final DigestURI location, final String mimeType, final String charset, final Set<String> languages,
final String[] keywords, final String title, final String author,
final String[] sections, final String abstrct,
final byte[] text, final Map<DigestURI, String> anchors, final HashMap<String, ImageEntry> images) {
this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
}
public Document(final DigestURI location, final String mimeType, final String charset, final Set<String> languages,
final String[] keywords, final String title, final String author,
final String[] sections, final String abstrct,
final File text, final Map<DigestURI, String> anchors, final HashMap<String, ImageEntry> images) {
this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
}
public Document(final DigestURI location, final String mimeType, final String charset, final Set<String> languages,
final String[] keywords, final String title, final String author,
final String[] sections, final String abstrct,
final CachedFileOutputStream text, final Map<DigestURI, String> anchors, final HashMap<String, ImageEntry> images) {
this(location, mimeType, charset, languages, keywords, title, author, sections, abstrct, (Object)text, anchors, images);
}
/**
 * Sets the counter for inbound links of this document.
 *
 * @param il the new inbound link count
 */
public void setInboundLinks(int il) {
this.inboundLinks = il;
}
@ -560,6 +538,10 @@ dc_rights
return (this.outboundLinks < 0) ? 0 : this.outboundLinks;
}
/**
 * Reports the indexing-denied flag of this document.
 * The flag is the value that was handed to the constructor as {@code indexingDenied}.
 *
 * @return the stored indexing-denied flag
 */
public boolean indexingDenied() {
return this.indexingDenied;
}
public void writeXML(OutputStreamWriter os, Date date) throws IOException {
os.write("<record>\n");
String title = this.dc_title();

@ -231,7 +231,8 @@ public class DCEntry extends TreeMap<String, String> {
"",
getDescription().getBytes("UTF-8"),
null,
null);
null,
false);
} catch (UnsupportedEncodingException e) {
Log.logException(e);
return null;

@ -94,7 +94,8 @@ public class csvParser extends AbstractParser implements Idiom {
null,
sb.toString().getBytes(charset),
null,
null);
null,
false);
} catch (UnsupportedEncodingException e) {
throw new ParserException("error in csvParser, getBytes: " + e.getMessage(), location);
}

@ -109,7 +109,8 @@ public class docParser extends AbstractParser implements Idiom {
null,
contents.toString().getBytes("UTF-8"),
null,
null);
null,
false);
} catch (UnsupportedEncodingException e) {
throw new ParserException("error in docParser, getBytes: " + e.getMessage(), location);
}

@ -357,6 +357,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
<meta name="DC.type" scheme="DCTERMS.DCMIType" content="Text" />
*/
/**
 * Tells whether the scraped page asks not to be indexed.
 *
 * Looks up the value stored under the key "robots" in the collected meta tags and
 * checks whether it contains the directive "noindex". The check ignores case, because
 * robots directives are case-insensitive: "NOINDEX", "NoIndex" and "noindex" all
 * express the same request.
 *
 * @return true if a "robots" meta value is present and contains "noindex" in any
 *         letter case; false if there is no such value or the directive is absent
 */
public boolean indexingDenied() {
    final String robots = metas.get("robots");
    if (robots == null) return false;
    // use a fixed locale: the default-locale toLowerCase() would turn 'I' into a
    // dotless i under a Turkish locale and "NOINDEX" would then never match
    return robots.toLowerCase(java.util.Locale.ENGLISH).indexOf("noindex") >= 0;
}
public String getDescription() {
String s = metas.get("description");
if (s == null) s = metas.get("dc.description");

@ -152,7 +152,8 @@ public class htmlParser extends AbstractParser implements Idiom {
scraper.getDescription(),
scraper.getText(),
scraper.getAnchors(),
scraper.getImages());
scraper.getImages(),
scraper.indexingDenied());
//scraper.close();
ppd.setFavicon(scraper.getFavicon());
return ppd;
@ -242,4 +243,7 @@ public class htmlParser extends AbstractParser implements Idiom {
return SUPPORTED_EXTENSIONS;
}
/**
 * Always reports that indexing is not denied.
 *
 * NOTE(review): the per-document flag is taken from {@code scraper.indexingDenied()}
 * when the Document is constructed in this parser; no caller of this parser-level
 * method is visible here, so confirm whether it is still needed.
 *
 * @return always false
 */
public boolean indexingDenied() {
return false;
}
}

@ -147,7 +147,8 @@ public class bmpParser extends AbstractParser implements Idiom {
"", // description
sb.toString().getBytes(), // content text
anchors, // anchors
images); // images
images,
false); // images
}
public static IMAGEMAP parse(final byte[] source) {

@ -135,7 +135,8 @@ public class genericImageParser extends AbstractParser implements Idiom {
"", // description
sb.toString().getBytes(), // content text
anchors, // anchors
images); // images
images,
false); // images
}
/*
* Document(final DigestURI location, final String mimeType, final String charset, final Set<String> languages,

@ -201,7 +201,8 @@ public class odtParser extends AbstractParser implements Idiom {
docDescription,
contentBytes,
null,
null);
null,
false);
} else {
theDoc = new Document(
location,
@ -215,7 +216,8 @@ public class odtParser extends AbstractParser implements Idiom {
docDescription,
writerFile,
null,
null);
null,
false);
}
return theDoc;
} catch (final Exception e) {

@ -188,7 +188,8 @@ public class ooxmlParser extends AbstractParser implements Idiom {
docDescription,
contentBytes,
null,
null);
null,
false);
} else {
theDoc = new Document(
location,
@ -202,7 +203,8 @@ public class ooxmlParser extends AbstractParser implements Idiom {
docDescription,
writerFile,
null,
null);
null,
false);
}
return theDoc;
} catch (final Exception e) {

@ -182,7 +182,8 @@ public class pdfParser extends AbstractParser implements Idiom {
null,
contentBytes,
null,
null);
null,
false);
} else {
theDoc = new Document(
location,
@ -196,7 +197,8 @@ public class pdfParser extends AbstractParser implements Idiom {
null,
writerFile,
null,
null);
null,
false);
}
return theDoc;

@ -105,7 +105,8 @@ public class pptParser extends AbstractParser implements Idiom {
null,
contents.getBytes("UTF-8"),
null,
null);
null,
false);
return theDoc;
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;

@ -131,7 +131,8 @@ public class psParser extends AbstractParser implements Idiom {
null,
outputFile,
null,
null);
null,
false);
return theDoc;
} catch (final Exception e) {

@ -187,7 +187,8 @@ public class rssParser extends AbstractParser implements Idiom {
feedDescription,
text.getBytes(),
anchors,
images);
images,
false);
// close streams
try {
text.close();

@ -89,7 +89,8 @@ public class rtfParser extends AbstractParser implements Idiom {
null,
bodyText.getBytes("UTF-8"),
null,
null);
null,
false);
return theDoc;
}

@ -71,7 +71,7 @@ public class sevenzipParser extends AbstractParser implements Idiom {
public Document parse(final DigestURI location, final String mimeType, final String charset,
final IInStream source, final long maxRamSize) throws ParserException, InterruptedException {
final Document doc = new Document(location, mimeType, charset, null);
final Document doc = new Document(location, mimeType, charset, null, null, null, null, null, null, (Object)null, null, null, false);
Handler archive;
super.theLogger.logFine("opening 7zip archive...");
try {

@ -135,7 +135,8 @@ public class swfParser extends AbstractParser implements Idiom {
abstrct, // an abstract
contents.getBytes("UTF-8"), // the parsed document text
anchors, // a map of extracted anchors
null); // a treeset of image URLs
null,
false); // a treeset of image URLs
return theDoc;
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;

@ -131,7 +131,7 @@ public class tarParser extends AbstractParser implements Idiom {
final String entryName = entry.getName();
// getting the entry file extension
final int idx = entryName.lastIndexOf(".");
final int idx = entryName.lastIndexOf('.');
final String entryExt = (idx > -1) ? entryName.substring(idx+1) : "";
// trying to determine the mimeType per file extension
@ -198,7 +198,8 @@ public class tarParser extends AbstractParser implements Idiom {
docAbstrct.toString(),
((ByteBuffer)docText).getBytes(),
docAnchors,
docImages);
docImages,
false);
} else {
result = new Document(
location,
@ -212,7 +213,8 @@ public class tarParser extends AbstractParser implements Idiom {
docAbstrct.toString(),
outputFile,
docAnchors,
docImages);
docImages,
false);
}
return result;

@ -122,7 +122,8 @@ public class torrentParser extends AbstractParser implements Idiom {
null,
filenames.toString().getBytes(charset),
null,
null);
null,
false);
} catch (UnsupportedEncodingException e) {
throw new ParserException("error in torrentParser, getBytes: " + e.getMessage(), location);
}

@ -235,7 +235,8 @@ public class vcfParser extends AbstractParser implements Idiom {
"vCard", // an abstract
text, // the parsed document text
anchors, // a map of extracted anchors
null); // a treeset of image URLs
null, // a treeset of image URLs
false);
return theDoc;
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;

@ -132,7 +132,8 @@ public class vsdParser extends AbstractParser implements Idiom {
abstrct, // an abstract
contents.getBytes("UTF-8"), // the parsed document text
null, // a map of extracted anchors
null); // a treeset of image URLs
null, // a treeset of image URLs
false);
return theDoc;
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;

@ -150,7 +150,8 @@ public class xlsParser extends AbstractParser implements Idiom {
null,
contents.getBytes("UTF-8"),
null,
null);
null,
false);
return theDoc;
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;

@ -118,7 +118,7 @@ public class zipParser extends AbstractParser implements Idiom {
// Get the entry name
final String entryName = entry.getName();
final int idx = entryName.lastIndexOf(".");
final int idx = entryName.lastIndexOf('.');
// getting the file extension
final String entryExt = (idx > -1) ? entryName.substring(idx+1) : "";
@ -185,7 +185,8 @@ public class zipParser extends AbstractParser implements Idiom {
docAbstrct.toString(),
((ByteBuffer)docText).getBytes(),
docAnchors,
docImages);
docImages,
false);
} else {
result = new Document(
location,
@ -199,7 +200,8 @@ public class zipParser extends AbstractParser implements Idiom {
docAbstrct.toString(),
outputFile,
docAnchors,
docImages);
docImages,
false);
}
return result;

Loading…
Cancel
Save