added author tag to indexing content

enhanced composition of title tag
TODO: insert author information for external parsers

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3488 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 18 years ago
parent 6be57983a8
commit a738b57b31

@ -238,7 +238,8 @@ public class yacysearch {
map.put("url", comp.url().toNormalform().replace(',', '|'));
map.put("title", comp.descr().replace(',', ' '));
map.put("description", ((document == null) ? comp.descr() : document.getMainLongTitle()).replace(',', ' '));
map.put("tags", ((document == null) ? "" : document.getKeywords(' ')));
map.put("author", ((document == null) ? "" : document.getAuthor()));
map.put("tags", ((document == null) ? "" : document.getKeywords(' ')));
yacyCore.newsPool.publishMyNews(new yacyNewsRecord("stippadd", map));
document.close();
}

@ -275,8 +275,22 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
public String getTitle() {
// construct a title string, even if the document has no title
// if there is one, return it
if (title.length() > 0) return title;
// some documents have a title tag as meta tag
String s = (String) metas.get("title");
// try to construct the title with the content of the title tag
if (title.length() > 0) {
if (s == null) {
return title;
} else {
if ((title.compareToIgnoreCase(s) == 0) || (title.indexOf(s) >= 0)) return s; else return title + ": " + s;
}
} else {
if (s != null) {
return s;
}
}
// otherwise take any headline
for (int i = 0; i < 4; i++) {
@ -284,7 +298,7 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
}
// take description tag
String s = getDescription();
s = getDescription();
if (s.length() > 0) return s;
// extract headline from content
@ -336,9 +350,11 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
if (s == null) return ""; else return s;
}
public String getCopyright() {
String s = (String) metas.get("copyright");
if (s == null) return ""; else return s;
/**
 * Returns the author information collected from the document's meta tags.
 * The "author" meta entry is preferred; if it is absent, the "copyright"
 * meta entry is used instead.
 *
 * @return the "author" meta value, else the "copyright" meta value,
 *         else an empty string (never null)
 */
public String getAuthor() {
    // first choice: an explicit author meta tag
    final String authorMeta = (String) metas.get("author");
    if (authorMeta != null) return authorMeta;
    // fallback: the copyright meta tag, or an empty string if that is missing as well
    final String copyrightMeta = (String) metas.get("copyright");
    return (copyrightMeta == null) ? "" : copyrightMeta;
}
public String[] getContentLanguages() {

@ -97,6 +97,7 @@ implements Parser {
replaceAll("\r"," ").
replaceAll("\t"," "),
null,
"", // TODO: AUTHOR
null,
null,
contents.getBytes("UTF-8"),

@ -173,6 +173,7 @@ public class odtParser extends AbstractParser implements Parser {
docKeywords,
docShortTitle,
docLongTitle,
"", // TODO: AUTHOR
null,
docDescription,
contentBytes,
@ -186,6 +187,7 @@ public class odtParser extends AbstractParser implements Parser {
docKeywords,
docShortTitle,
docLongTitle,
"", // TODO: AUTHOR
null,
docDescription,
writerFile,

@ -158,6 +158,7 @@ public class pdfParser extends AbstractParser implements Parser {
docKeywords,
docSubject,
docTitle,
"", // TODO: AUTHOR
null,
null,
contentBytes,
@ -171,6 +172,7 @@ public class pdfParser extends AbstractParser implements Parser {
docKeywords,
docSubject,
docTitle,
"", // TODO: AUTHOR
null,
null,
writerFile,

@ -114,6 +114,7 @@ public class pptParser extends AbstractParser implements Parser {
replaceAll("\r"," ").
replaceAll("\t"," "),
null,
"", // TODO: AUTHOR
null,
null,
contents.getBytes("UTF-8"),

@ -152,6 +152,7 @@ public class rpmParser extends AbstractParser implements Parser {
null,
name,
summary,
"", // TODO: AUTHOR
null,
description,
content.toString().getBytes("UTF-8"),

@ -190,6 +190,7 @@ public class rssParser extends AbstractParser implements Parser {
null,
null,
feedTitle,
"", // TODO: AUTHOR
(String[]) feedSections.toArray(new String[feedSections.size()]),
feedDescription,
text.getBytes(),

@ -102,6 +102,7 @@ implements Parser {
replaceAll("\r"," ").
replaceAll("\t"," "),
null,
"", // TODO: AUTHOR
null,
null,
bodyText.getBytes("UTF-8"),

@ -135,6 +135,7 @@ public class swfParser extends AbstractParser implements Parser {
replaceAll("\r"," ").
replaceAll("\t"," "), //short title
longTitle, // a long document title
"", // TODO: AUTHOR
sections, // an array of section headlines
abstrct, // an abstract
contents.getBytes("UTF-8"), // the parsed document text

@ -213,6 +213,7 @@ public class tarParser extends AbstractParser implements Parser {
docKeywords.toString().split(" |,"),
docShortTitle.toString(),
docLongTitle.toString(),
"", // TODO: AUTHOR
(String[])docSections.toArray(new String[docSections.size()]),
docAbstrct.toString(),
((serverByteBuffer)docText).toByteArray(),
@ -226,6 +227,7 @@ public class tarParser extends AbstractParser implements Parser {
docKeywords.toString().split(" |,"),
docShortTitle.toString(),
docLongTitle.toString(),
"", // TODO: AUTHOR
(String[])docSections.toArray(new String[docSections.size()]),
docAbstrct.toString(),
outputFile,

@ -247,6 +247,7 @@ public class vcfParser extends AbstractParser implements Parser {
null, // a list of extracted keywords
null, // a short document title
parsedTitle.toString(), // a long document title
"", // TODO: AUTHOR
sections, // an array of section headlines
"vCard", // an abstract
text, // the parsed document text

@ -142,6 +142,7 @@ public class xlsParser extends AbstractParser implements Parser, HSSFListener {
replaceAll("\r"," ").
replaceAll("\t"," "),
null,
"", // TODO: AUTHOR
null,
null,
contents.getBytes("UTF-8"),

@ -197,6 +197,7 @@ public class zipParser extends AbstractParser implements Parser {
docKeywords.toString().split(" |,"),
docShortTitle.toString(),
docLongTitle.toString(),
"", // TODO: AUTHOR
(String[])docSections.toArray(new String[docSections.size()]),
docAbstrct.toString(),
((serverByteBuffer)docText).toByteArray(),
@ -210,6 +211,7 @@ public class zipParser extends AbstractParser implements Parser {
docKeywords.toString().split(" |,"),
docShortTitle.toString(),
docLongTitle.toString(),
"", // TODO: AUTHOR
(String[])docSections.toArray(new String[docSections.size()]),
docAbstrct.toString(),
outputFile,

@ -154,7 +154,7 @@ public final class plasmaCondenser {
insertTextToWords(document.getMainLongTitle(), 1, indexRWIEntryNew.flag_app_descr, wflags);
insertTextToWords(document.getMainShortTitle(), 2, indexRWIEntryNew.flag_app_descr, wflags);
insertTextToWords(document.getAbstract(), 3, indexRWIEntryNew.flag_app_descr, wflags);
// missing: author!
insertTextToWords(document.getAuthor(), 4, indexRWIEntryNew.flag_app_descr, wflags);
// missing: tags!
String[] titles = document.getSectionTitles();
for (int i = 0; i < titles.length; i++) {

@ -753,11 +753,19 @@ public final class plasmaParser {
String[] sections = new String[scraper.getHeadlines(1).length + scraper.getHeadlines(2).length + scraper.getHeadlines(3).length + scraper.getHeadlines(4).length];
int p = 0;
for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j];
plasmaParserDocument ppd = new plasmaParserDocument(new URL(location.toNormalform()),
mimeType, charSet, scraper.getKeywords(),
scraper.getTitle(), scraper.getTitle(),
sections, scraper.getDescription(),
scraper.getText(), scraper.getAnchors(), scraper.getImages());
plasmaParserDocument ppd = new plasmaParserDocument(
new URL(location.toNormalform()),
mimeType,
charSet,
scraper.getKeywords(),
scraper.getTitle(),
scraper.getTitle(),
scraper.getAuthor(),
sections,
scraper.getDescription(),
scraper.getText(),
scraper.getAnchors(),
scraper.getImages());
//scraper.close();
return ppd;
} catch (MalformedURLException e) {

@ -66,6 +66,7 @@ public class plasmaParserDocument {
private String[] keywords; // most resources provide a keyword field
private String shortTitle; // a shortTitle mostly appears in the window header (border)
private String longTitle; // the real title of the document, commonly h1-tags
private String author; // author or copyright
private String[] sections; // if present: more titles/headlines appearing in the document
private String abstrct; // an abstract, if present: short content description
private Object text; // the clear text, all that is visible
@ -80,7 +81,7 @@ public class plasmaParserDocument {
private InputStream textStream;
public plasmaParserDocument(URL location, String mimeType, String charset,
String[] keywords, String shortTitle, String longTitle,
String[] keywords, String shortTitle, String longTitle, String author,
String[] sections, String abstrct,
byte[] text, Map anchors, TreeSet images) {
this.location = location;
@ -89,6 +90,7 @@ public class plasmaParserDocument {
this.keywords = (keywords==null) ? new String[0] : keywords;
this.shortTitle = (shortTitle==null)?"":shortTitle;
this.longTitle = (longTitle==null)?"":longTitle;
this.author = (author==null)?"":author;
this.sections = (sections==null)?new String[0]:sections;
this.abstrct = (abstrct==null)?"":abstrct;
this.text = (text==null)?new byte[0]:text;
@ -103,7 +105,7 @@ public class plasmaParserDocument {
}
public plasmaParserDocument(URL location, String mimeType, String charset,
String[] keywords, String shortTitle, String longTitle,
String[] keywords, String shortTitle, String longTitle, String author,
String[] sections, String abstrct,
File text, Map anchors, TreeSet images) {
this.location = location;
@ -112,6 +114,7 @@ public class plasmaParserDocument {
this.keywords = (keywords==null) ? new String[0] : keywords;
this.shortTitle = (shortTitle==null)?"":shortTitle;
this.longTitle = (longTitle==null)?"":longTitle;
this.author = (author==null)?"":author;
this.sections = (sections==null)?new String[0]:sections;
this.abstrct = (abstrct==null)?"":abstrct;
this.text = text;
@ -157,6 +160,10 @@ public class plasmaParserDocument {
if (abstrct != null) return abstrct; else return getMainLongTitle();
}
/**
 * Returns the author (or copyright) string stored for this document.
 *
 * @return the stored author string, or an empty string if none is set (never null)
 */
public String getAuthor() {
    // the constructors visible in this class already replace a null author by "",
    // the check here is kept as a defensive guard
    return (author == null) ? "" : author;
}
public InputStream getText() {
try {
if (this.text == null) return null;

@ -1645,6 +1645,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
return false;
}
if ((sbQueue.size() == 0) && ((getThread(CRAWLJOB_LOCAL_CRAWL).getJobCount() == 0))) setPerformance((int) Math.max(120, 60000 / getConfigLong(INDEX_DIST_BUSYSLEEP, 6000))); // if there is no activity, set low performance
// flush some entries from the RAM cache
if (sbQueue.size() == 0) wordIndex.flushCacheSome(); // permanent flushing only if we are not busy
wordIndex.loadedURL.flushCacheSome();
@ -2227,8 +2229,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
indexURLEntry newEntry = wordIndex.loadedURL.newEntry(
entry.url(), // URL
docDescription, // document description
"", // author
"", // tags
document.getAuthor(), // author
document.getKeywords(' '), // tags
"", // ETag
docDate, // modification date
new Date(), // loaded date

@ -503,7 +503,7 @@ filterOutStopwordsFromTopwords=true
80_indexing_busysleep=50
80_indexing_memprereq=4194304
82_crawlstack_idlesleep=5000
82_crawlstack_busysleep=0
82_crawlstack_busysleep=10
82_crawlstack_memprereq=1048576
90_cleanup_idlesleep=300000
90_cleanup_busysleep=300000

Loading…
Cancel
Save