Commit b9396ae6 authored by John David Osborne's avatar John David Osborne
Browse files

Some memory issues still

parent df2fadd2
......@@ -17,8 +17,12 @@ import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
......@@ -42,6 +46,8 @@ import org.slf4j.LoggerFactory;
public class UmlsIndexWriter {
private static final Logger LOG = LoggerFactory.getLogger(UmlsIndexWriter.class);
private String jdbcConnectString;
Map<String,Set<String>> word2term = new HashMap<String,Set<String>>(500000);
/**
* Write a simple UMLS Indexer
......@@ -83,13 +89,13 @@ public class UmlsIndexWriter {
HashSet<String> termset = new HashSet<String>(Arrays.asList(termnames));
//Foreach term/synonym
for(String tnames : termset) {
System.out.println("Dealing with termname:"+tnames);
LOG.debug("Dealing with concept name:"+tnames);
//Tokenize and underscore to reflect what Lucene does and
List<String> tokens = tokenizeString(analyzer, tnames);
StringBuilder sb = new StringBuilder();
for(int i=0;i<tokens.size();i++){
String tok = tokens.get(i);
System.out.println("Dealing with tokens:"+tok);
LOG.debug("Dealing with tokens:"+tok);
if(i<tokens.size()-1) { sb.append(tok); sb.append("_"); }
else sb.append(tok);
}
......@@ -97,11 +103,28 @@ public class UmlsIndexWriter {
addConceptDoc(conW, cui,officialLuceneTerm,commaSTs.toString());
List<String> nostops = dropStopWords(tokens);
List<String> words = stemWords(nostops);
addTermDoc(termW, officialLuceneTerm,nostops,words);
//addTermDoc(termW, officialLuceneTerm,nostops);
addWord2Term(officialLuceneTerm,words);
}
rs.next();
}
conW.close();
for(Iterator<String> it = word2term.keySet().iterator();it.hasNext();){
String word = it.next();
Document doc = new Document();
doc.add(new TextField("word", word, Field.Store.YES));
Set<String> conceptTexts = word2term.get(word);
for(String ctext : conceptTexts) {
StoredField strField = new StoredField("conceptText", ctext);
doc.add(strField);
}
LOG.debug("Adding document for "+word+" with "+conceptTexts.size()+" entries");
termW.addDocument(doc);
it.remove();
}
termW.close();
wordIndex.close();
termIndex.close();
rs.close();
......@@ -154,12 +177,13 @@ Text fields are useful for keyword search.
* @param tokenized
* @throws IOException
*/
private void addTermDoc(IndexWriter w, String conceptUnderscoredText, List<String> nostops, List<String>tokenized) throws IOException {
private void addTermDoc(IndexWriter w, String conceptUnderscoredText, List<String> nostops) throws IOException {
Document doc = new Document();
for(String tword : nostops) {
doc.add(new TextField("word", tword, Field.Store.YES));
StoredField strField = new StoredField("conceptText", conceptUnderscoredText);
doc.add(strField);
LOG.info("Adding to words:"+tword+" with concept name:"+conceptUnderscoredText);
w.addDocument(doc);
}
}
......@@ -173,7 +197,7 @@ Text fields are useful for keyword search.
//doc.add(new StringField("sty", semantic_type, Field.Store.YES));
StoredField styField = new StoredField("sty", semantic_type);
doc.add(styField);
System.out.println("Adding to concepts:"+conceptUnderscoredText+" with cui:"+cui+" with types:"+semantic_type);
LOG.info("Adding to concepts:"+conceptUnderscoredText+" with cui:"+cui+" with types:"+semantic_type);
w.addDocument(doc);
}
......@@ -223,6 +247,18 @@ Text fields are useful for keyword search.
return new ArrayList<String>(allWords);
}
/**
 * Records, for every token, the set of concept names that contain it, in the
 * in-memory word2term map (flushed to the word index later, one document per
 * word).
 *
 * @param concept underscore-joined concept name to associate with each token
 * @param tokens stemmed, stop-word-filtered tokens of the concept name
 */
private void addWord2Term(String concept, List<String> tokens) {
	for(String tok : tokens) {
		// computeIfAbsent replaces the get / null-check / put dance.
		Set<String> s = word2term.computeIfAbsent(tok, k -> new HashSet<String>());
		s.add(concept);
		// DEBUG + parameterized message: an INFO-level string concatenation per
		// token is noisy and allocates needlessly on this hot path, and the
		// file's convention is LOG.debug for per-item messages.
		LOG.debug("Added/Updated {} with {} concepts", tok, s.size());
	}
}
}
# Set root logger level to DEBUG and its only appender to A1.
log4j.rootLogger=DEBUG, A1
log4j.rootLogger=INFO, A1
# A1 is set to be a ConsoleAppender.
log4j.appender.A1=org.apache.log4j.ConsoleAppender
......
......@@ -19,6 +19,7 @@ WHERE mrconso.LAT='ENG'
-- AND mrconso.ts = 'P'
-- AND mrconso.stt = 'PF'
-- AND mrconso.ispref = 'Y'
-- AND tui IN ('T046')
-- AND tui IN ('T046','T047')
-- AND mrconso.cui='C0814136'
GROUP BY mrconso.cui
......
......@@ -4,7 +4,6 @@ import static org.junit.Assert.*;
import java.io.IOException;
import java.nio.file.Paths;
import org.junit.Test;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
......@@ -23,8 +22,10 @@ import org.apache.lucene.store.FSDirectory;
*/
public class LuceneIndexIT
{
private IndexSearcher searcher = null;
private QueryParser parser = null;
private IndexSearcher word2termSearcher = null;
private IndexSearcher term2conceptSearcher = null;
private QueryParser wordParser = null;
private QueryParser termParser = null;
/**
* Create the test case
*
......@@ -32,8 +33,10 @@ public class LuceneIndexIT
*/
public LuceneIndexIT() throws IOException
{
searcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(Paths.get("target/index.lucene"))));
parser = new QueryParser("stemmedTerms", new StandardAnalyzer());
word2termSearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(Paths.get("target/word2concept.lucene"))));
term2conceptSearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(Paths.get("target/term2concept.lucene"))));
wordParser = new QueryParser("word", new StandardAnalyzer());
termParser = new QueryParser("concept", new StandardAnalyzer());
}
......@@ -42,11 +45,11 @@ public class LuceneIndexIT
@org.junit.Test
public void testIndex() throws Exception
{
TopDocs td = performSearch("cancer", 8);
TopDocs td = performSearch(wordParser,"mucocutaneous ulcers", 20);
ScoreDoc[] hits = td.scoreDocs;
System.out.println("Number of hits: " + hits.length);
for (int i = 0; i < hits.length; i++) {
Document hitDoc = searcher.doc(hits[i].doc);
Document hitDoc = word2termSearcher.doc(hits[i].doc);
System.out.println(hitDoc.get("stemmedTerms"));
System.out.println(hitDoc.get("cui")+" with score:"+hits[i].score);
System.out.println(hitDoc.get("sty"));
......@@ -54,14 +57,14 @@ public class LuceneIndexIT
}
}
public TopDocs performSearch(String queryString, int n)
public TopDocs performSearch(QueryParser qp,String queryString, int n)
throws IOException, ParseException {
Query query = parser.parse(queryString);
return searcher.search(query, n);
Query query = qp.parse(queryString);
return word2termSearcher.search(query, n);
}
/**
 * Fetches a stored document from the word-to-concept index by its internal
 * Lucene document id (e.g. taken from a ScoreDoc hit).
 *
 * @param docId internal Lucene document id
 * @return the stored Document
 * @throws IOException if the index read fails
 */
public Document getDocument(int docId)
        throws IOException {
    // Resolved diff residue: the removed line still referenced the old
    // "searcher" field, which this commit renamed to word2termSearcher.
    return word2termSearcher.doc(docId);
}
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment