Commit 14d0f1a0 authored by John David Osborne
Browse files

Functional term lookup that likely performs poorly since it only uses basic...

Functional term lookup that likely performs poorly since it only uses basic Lucene. No accounting for word order, text coverage, etc.
parent b9396ae6
......@@ -4,6 +4,6 @@ public class Config {
public static final String UMLS_CONCEPT_RETRIEVE_SQL_PATH = "sql/oracle/umlsConceptDefSelect.sql";
public static final String UMLS_LUCENE_INDEX_DIR = "target/index.lucene";
public static final String UMLS_WORD2TERM_INDEX_DIR = "target/word2term.lucene";
public static final String UMLS_TERM2CONCEPT_INDEX_DIR = "target/term2concept.lucene";
public static final String UMLS_WORD2TERM_INDEX_DIR = "target/word2term.lucene"; //Words in TermNames
public static final String UMLS_TERM2CONCEPT_INDEX_DIR = "target/term2concept.lucene"; //Contains CUIs
}
......@@ -110,24 +110,28 @@ public class UmlsIndexWriter {
rs.next();
}
conW.close();
termIndex.close();
rs.close();
for(Iterator<String> it = word2term.keySet().iterator();it.hasNext();){
String word = it.next();
Document doc = new Document();
doc.add(new TextField("word", word, Field.Store.YES));
Set<String> conceptTexts = word2term.get(word);
StringBuilder sb = new StringBuilder();
for(String ctext : conceptTexts) {
StoredField strField = new StoredField("conceptText", ctext);
doc.add(strField);
sb.append(ctext); sb.append(" ");
}
LOG.debug("Adding document for "+word+" with "+conceptTexts.size()+" entries");
StoredField strField = new StoredField("conceptText", sb.toString());
doc.add(strField);
LOG.debug("Concept text after adding all that was:"+doc.get("conceptText"));
LOG.info("Adding document for "+word+" with "+conceptTexts.size()+" entries");
termW.addDocument(doc);
it.remove();
}
termW.close();
wordIndex.close();
termIndex.close();
rs.close();
} catch (Exception e) { e.printStackTrace(); }
}
......@@ -197,7 +201,7 @@ Text fields are useful for keyword search.
//doc.add(new StringField("sty", semantic_type, Field.Store.YES));
StoredField styField = new StoredField("sty", semantic_type);
doc.add(styField);
LOG.info("Adding to concepts:"+conceptUnderscoredText+" with cui:"+cui+" with types:"+semantic_type);
LOG.debug("Adding to concepts:"+conceptUnderscoredText+" with cui:"+cui+" with types:"+semantic_type);
w.addDocument(doc);
}
......@@ -255,7 +259,7 @@ Text fields are useful for keyword search.
if(s==null) s = new HashSet<String>();
s.add(concept);
word2term.put(tok, s);
LOG.info("Added/Updated "+tok+" with "+s.size()+" concepts");
LOG.debug("Added/Updated "+tok+" with "+s.size()+" concepts");
}
}
......
......@@ -19,7 +19,7 @@ WHERE mrconso.LAT='ENG'
-- AND mrconso.ts = 'P'
-- AND mrconso.stt = 'PF'
-- AND mrconso.ispref = 'Y'
-- AND tui IN ('T046')
AND tui IN ('T046')
-- AND tui IN ('T046','T047')
-- AND mrconso.cui='C0814136'
GROUP BY mrconso.cui
......
package edu.uab.ccts.nlp.umlsIndex.test.integration;
import static org.junit.Assert.*;
import java.io.IOException;
import java.nio.file.Paths;
......@@ -16,6 +15,10 @@ import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import com.google.common.collect.HashMultiset;
import edu.uab.ccts.nlp.umlsIndex.Config;
/**
* Unit test for simple App.
......@@ -33,8 +36,8 @@ public class LuceneIndexIT
*/
public LuceneIndexIT() throws IOException
{
word2termSearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(Paths.get("target/word2concept.lucene"))));
term2conceptSearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(Paths.get("target/term2concept.lucene"))));
word2termSearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(Paths.get(Config.UMLS_WORD2TERM_INDEX_DIR))));
term2conceptSearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(Paths.get(Config.UMLS_TERM2CONCEPT_INDEX_DIR))));
wordParser = new QueryParser("word", new StandardAnalyzer());
termParser = new QueryParser("concept", new StandardAnalyzer());
}
......@@ -45,24 +48,41 @@ public class LuceneIndexIT
@org.junit.Test
public void testIndex() throws Exception
{
TopDocs td = performSearch(wordParser,"mucocutaneous ulcers", 20);
TopDocs td = performSearch(wordParser,word2termSearcher,"multiple ulcers", 100);
ScoreDoc[] hits = td.scoreDocs;
System.out.println("Number of hits: " + hits.length);
HashMultiset<String> searchsummary = HashMultiset.create();
for (int i = 0; i < hits.length; i++) {
Document hitDoc = word2termSearcher.doc(hits[i].doc);
System.out.println(hitDoc.get("stemmedTerms"));
System.out.println(hitDoc.get("cui")+" with score:"+hits[i].score);
System.out.println(hitDoc.get("sty"));
//assertEquals("C0814136", hitDoc.get("cui"));
String theword = hitDoc.get("word");
String allconcept_text = hitDoc.get("conceptText");
String[] allcons = allconcept_text.split(" ");
System.out.println(theword+" id:"+hits[i].doc+" with score:"+hits[i].score
+" is associated with "+allcons.length+" concepts, see::"+allconcept_text);
HashMultiset<String> hitsummary = HashMultiset.create();
for(int j=0;j<allcons.length;j++) {
TopDocs topcons = performSearch(termParser,term2conceptSearcher,allcons[j], 1);
ScoreDoc[] conhits = topcons.scoreDocs;
Document conDoc = term2conceptSearcher.doc(conhits[0].doc);
String cui = conDoc.get("cui");
System.out.println(conDoc.get("concept")+" with concept score:"+
conhits[0].score+" and cui: "+cui);
hitsummary.add(cui);
}
searchsummary.addAll(hitsummary);
}
assert(searchsummary.contains("C1265815"));
}
public TopDocs performSearch(QueryParser qp,String queryString, int n)
public TopDocs performSearch(QueryParser qp, IndexSearcher is, String queryString, int n)
throws IOException, ParseException {
Query query = qp.parse(queryString);
return word2termSearcher.search(query, n);
return is.search(query, n);
}
public Document getDocument(int docId)
throws IOException {
return word2termSearcher.doc(docId);
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment