Reduced the memory footprint of the synonyms library.

The idea is to avoid unnecessary duplication of String objects for the same
words. This is particularly effective with the large Moby thesaurus.

Memory footprint measurements with VisualVM :
 - openthesaurus_de_yacy :
 	- initial : 19 443 796 bytes
 	- after refactoring : 18 012 606 bytes

 - mobythesaurus_en_yacy :
 	- initial : 343 453 904 bytes
 	- after refactoring : 173 843 780 bytes

 - thesaurus_ru_yacy :
 	- initial : 3 800 706 bytes
 	- after refactoring : 3 466 612 bytes

 - de + en + ru : 
 	- initial : 366 603 450 bytes
 	- after refactoring : 195 015 914 bytes
pull/93/head
luccioman 8 years ago
parent 60b3adfb43
commit 568e3dde6a

@ -49,6 +49,9 @@ public class SynonymLibrary {
lib.clear();
if (!path.exists() || !path.isDirectory()) return;
final String[] files = path.list();
/* Global map of all known distinct words: this enables reuse of the same String instance
* for a word appearing multiple times in different synonym sets */
final Map<String, String> distinctWords = new HashMap<>();
for (final String f: files) {
File ff = new File(path, f);
String line;
@ -62,12 +65,20 @@ public class SynonymLibrary {
String[] words = CommonPattern.COMMA.split(line);
Set<String> synonyms = new HashSet<String>();
Set<String> keys = new HashSet<String>();
for (String s: words) {
s = s.trim();
if (s.length() < 2) continue;
String t = s.toLowerCase();
synonyms.add(t);
keys.add(t.substring(0, 2));
for (String word: words) {
word = word.trim();
if (word.length() < 2) continue;
String lowCaseWord = word.toLowerCase();
String kownWord = distinctWords.get(lowCaseWord);
if(kownWord != null) {
/* This word is already known: reuse the existing String instance from the global map of known words to save memory */
lowCaseWord = kownWord;
} else {
/* First encounter of this word : let's add it to the global map of known words */
distinctWords.put(lowCaseWord, lowCaseWord);
}
synonyms.add(lowCaseWord);
keys.add(lowCaseWord.substring(0, 2));
}
for (String key: keys) {
List<Set<String>> symsetlist = lib.get(key);

Loading…
Cancel
Save