Commit f302ba5a authored by John Osborne's avatar John Osborne
Browse files

Major fixes, returns only hits found in all words for now. Logging changes.

parent fdef3502
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<classpath> <classpath>
<classpathentry kind="src" path="src/test/java"/> <classpathentry kind="src" output="target/test-classes" path="src/test/java">
<classpathentry kind="src" path="src/main/java"/> <attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="src" output="target/classes" path="src/main/java">
<attributes>
<attribute name="optional" value="true"/>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8"> <classpathentry kind="con" path="org.eclipse.jdt.launching.JRE_CONTAINER/org.eclipse.jdt.internal.debug.ui.launcher.StandardVMType/JavaSE-1.8">
<attributes> <attributes>
<attribute name="maven.pomderived" value="true"/> <attribute name="maven.pomderived" value="true"/>
...@@ -12,5 +22,10 @@ ...@@ -12,5 +22,10 @@
<attribute name="maven.pomderived" value="true"/> <attribute name="maven.pomderived" value="true"/>
</attributes> </attributes>
</classpathentry> </classpathentry>
<classpathentry excluding="**" kind="src" output="target/classes" path="src/main/resources">
<attributes>
<attribute name="maven.pomderived" value="true"/>
</attributes>
</classpathentry>
<classpathentry kind="output" path="target/classes"/> <classpathentry kind="output" path="target/classes"/>
</classpath> </classpath>
#!/bin/bash
# Launch the UMLS index search client against the locally built indexes.
# Usage: ./run.sh [query]    — query defaults to "multiple ulcers" for
# backward compatibility with the original hard-coded invocation.
java -cp target/test-classes:target/classes:target/umlsIndex-0.0.2-SNAPSHOT-jar-with-dependencies.jar \
  -Dlog4j.configuration=log4j.properties \
  edu.uab.ccts.nlp.umlsIndex.SearchClient "${1:-multiple ulcers}"
package edu.uab.ccts.nlp.umlsIndex;
import java.io.File;
import java.io.IOException;
import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.HashMultiset;
/**
 * Command-line client over two Lucene indexes: a word-to-term index and a
 * term-to-concept index. For a query it collects, per matching word document,
 * the set of UMLS CUIs, and returns only the CUIs common to every word hit
 * (set intersection), matching the stated intent "returns only hits found in
 * all words".
 */
public class SearchClient {

	private IndexSearcher word2termSearcher = null;
	private IndexSearcher term2conceptSearcher = null;
	private QueryParser wordParser = null;
	private QueryParser termParser = null;
	private static final Logger LOG = LoggerFactory.getLogger(SearchClient.class);

	/**
	 * Entry point. args[0] is the query string; prints each CUI shared by
	 * all word hits.
	 */
	public static void main(String[] args) throws Exception {
		if (args.length == 0) {
			System.err.println("Usage: SearchClient <query>");
			return;
		}
		File f = new File(Config.UMLS_WORD2TERM_INDEX_DIR);
		if (!(f.isDirectory() && f.list().length > 0)) {
			// Original printed the error and fell through into a constructor
			// that cannot open the missing index; bail out early instead.
			System.err.println("Missing " + f);
			return;
		}
		SearchClient sc = new SearchClient();
		TopDocs td = sc.performSearch(sc.getWordParser(), sc.getWord2termSearcher(), args[0], 100);
		ScoreDoc[] hits = td.scoreDocs;
		// Parameterized logging; original concatenation was also missing the
		// space before "has".
		LOG.info("{} has {} word hits", args[0], hits.length);
		HashMultiset<String> searchsummary = sc.doIndexSearch(hits);
		System.out.println("Got back " + searchsummary.elementSet().size() + " results from " + args[0]);
		for (String s : searchsummary.elementSet()) {
			System.out.println(s);
		}
	}

	/**
	 * Opens both index searchers and the query parsers over the directories
	 * configured in {@link Config}.
	 *
	 * @throws IOException if either index directory cannot be opened
	 */
	public SearchClient() throws IOException
	{
		word2termSearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(Paths.get(Config.UMLS_WORD2TERM_INDEX_DIR))));
		term2conceptSearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(Paths.get(Config.UMLS_TERM2CONCEPT_INDEX_DIR))));
		wordParser = new QueryParser("word", new StandardAnalyzer());
		termParser = new QueryParser("concept", new StandardAnalyzer());
	}

	/**
	 * Resolves each word hit's concepts to CUIs and intersects the per-hit
	 * CUI sets, so only CUIs present in every word hit survive.
	 *
	 * @param hits word-index hits whose "conceptText" field lists concepts
	 * @return the multiset of CUIs common to all hits (empty if none)
	 * @throws IOException on index read failure
	 * @throws ParseException if a concept term fails to parse as a query
	 */
	public HashMultiset<String> doIndexSearch(ScoreDoc[] hits) throws IOException, ParseException {
		HashMultiset<String> searchsummary = HashMultiset.create();
		// Track "no hit processed yet" explicitly. The original tested
		// searchsummary.size()==0, which wrongly RE-SEEDED the accumulator
		// whenever the running intersection became empty (or when the first
		// hit contributed no CUIs), breaking the all-words semantics.
		boolean first = true;
		for (int i = 0; i < hits.length; i++) {
			Document hitDoc = word2termSearcher.doc(hits[i].doc);
			String theword = hitDoc.get("word");
			String allconcept_text = hitDoc.get("conceptText");
			String[] allcons = allconcept_text.split(" ");
			LOG.debug("{} id:{} with score:{} is associated with {} concepts, see::{}",
					theword, hits[i].doc, hits[i].score, allcons.length, allconcept_text);
			HashMultiset<String> hitsummary = HashMultiset.create();
			for (int j = 0; j < allcons.length; j++) {
				TopDocs topcons = performSearch(termParser, term2conceptSearcher, allcons[j], 1);
				ScoreDoc[] conhits = topcons.scoreDocs;
				if (conhits.length == 0) {
					// Original dereferenced conhits[0] unconditionally and
					// would throw ArrayIndexOutOfBoundsException here.
					LOG.warn("No concept match for term: {}", allcons[j]);
					continue;
				}
				Document conDoc = term2conceptSearcher.doc(conhits[0].doc);
				String cui = conDoc.get("cui");
				LOG.debug("{} with concept score:{} and cui: {}",
						conDoc.get("concept"), conhits[0].score, cui);
				hitsummary.add(cui);
			}
			if (first) {
				searchsummary.addAll(hitsummary);
				first = false;
			} else {
				// Keep only CUIs seen in every word hit so far.
				searchsummary.retainAll(hitsummary);
			}
		}
		return searchsummary;
	}

	/**
	 * Parses {@code queryString} with {@code qp} and returns the top
	 * {@code n} hits from {@code is}.
	 */
	public TopDocs performSearch(QueryParser qp, IndexSearcher is, String queryString, int n)
			throws IOException, ParseException {
		Query query = qp.parse(queryString);
		return is.search(query, n);
	}

	/** Fetches a stored document from the word-to-term index by doc id. */
	public Document getDocument(int docId)
			throws IOException {
		return word2termSearcher.doc(docId);
	}

	public IndexSearcher getWord2termSearcher() {
		return word2termSearcher;
	}

	public void setWord2termSearcher(IndexSearcher word2termSearcher) {
		this.word2termSearcher = word2termSearcher;
	}

	public QueryParser getWordParser() {
		return wordParser;
	}

	public void setWordParser(QueryParser wordParser) {
		this.wordParser = wordParser;
	}
}
# Set root logger level to INFO and its only appender to A1.
log4j.rootLogger=INFO, A1
# A1 is set to be a ConsoleAppender.
log4j.appender.A1=org.apache.log4j.ConsoleAppender
# A1 uses PatternLayout.
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
...@@ -3,34 +3,26 @@ package edu.uab.ccts.nlp.umlsIndex.test.integration; ...@@ -3,34 +3,26 @@ package edu.uab.ccts.nlp.umlsIndex.test.integration;
import java.io.IOException; import java.io.IOException;
import java.io.File; import java.io.File;
import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;
import com.google.common.collect.HashMultiset; import com.google.common.collect.HashMultiset;
import edu.uab.ccts.nlp.umlsIndex.Config; import edu.uab.ccts.nlp.umlsIndex.Config;
import edu.uab.ccts.nlp.umlsIndex.SearchClient;
import org.junit.Assume; import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/** /**
* Unit test for simple App. * Unit test for simple App.
*/ */
public class LuceneIndexIT public class LuceneIndexIT
{ {
private IndexSearcher word2termSearcher = null;
private IndexSearcher term2conceptSearcher = null; SearchClient searchClient = null;
private QueryParser wordParser = null;
private QueryParser termParser = null; private static final Logger LOG = LoggerFactory.getLogger(LuceneIndexIT.class);
/** /**
* Create the test case * Create the test case
* *
...@@ -38,10 +30,7 @@ public class LuceneIndexIT ...@@ -38,10 +30,7 @@ public class LuceneIndexIT
*/ */
public LuceneIndexIT() throws IOException public LuceneIndexIT() throws IOException
{ {
word2termSearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(Paths.get(Config.UMLS_WORD2TERM_INDEX_DIR)))); searchClient = new SearchClient();
term2conceptSearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(Paths.get(Config.UMLS_TERM2CONCEPT_INDEX_DIR))));
wordParser = new QueryParser("word", new StandardAnalyzer());
termParser = new QueryParser("concept", new StandardAnalyzer());
} }
...@@ -53,43 +42,16 @@ public class LuceneIndexIT ...@@ -53,43 +42,16 @@ public class LuceneIndexIT
File f = new File(Config.UMLS_WORD2TERM_INDEX_DIR); File f = new File(Config.UMLS_WORD2TERM_INDEX_DIR);
org.junit.Assume.assumeTrue(!(f.isDirectory() && f.list().length>0)); org.junit.Assume.assumeTrue(!(f.isDirectory() && f.list().length>0));
TopDocs td = performSearch(wordParser,word2termSearcher,"multiple ulcers", 100); TopDocs td = searchClient.performSearch(searchClient.getWordParser(),
ScoreDoc[] hits = td.scoreDocs; searchClient.getWord2termSearcher(),"multiple ulcers", 100);
System.out.println("Number of hits: " + hits.length);
HashMultiset<String> searchsummary = HashMultiset.create();
for (int i = 0; i < hits.length; i++) {
Document hitDoc = word2termSearcher.doc(hits[i].doc);
String theword = hitDoc.get("word");
String allconcept_text = hitDoc.get("conceptText");
String[] allcons = allconcept_text.split(" ");
System.out.println(theword+" id:"+hits[i].doc+" with score:"+hits[i].score
+" is associated with "+allcons.length+" concepts, see::"+allconcept_text);
HashMultiset<String> hitsummary = HashMultiset.create(); ScoreDoc[] hits = td.scoreDocs;
for(int j=0;j<allcons.length;j++) { LOG.info("Number of hits: " + hits.length);
TopDocs topcons = performSearch(termParser,term2conceptSearcher,allcons[j], 1); HashMultiset<String> searchsummary = searchClient.doIndexSearch(hits);
ScoreDoc[] conhits = topcons.scoreDocs; assert(searchsummary.contains("C1265815"));
Document conDoc = term2conceptSearcher.doc(conhits[0].doc);
String cui = conDoc.get("cui");
System.out.println(conDoc.get("concept")+" with concept score:"+
conhits[0].score+" and cui: "+cui);
hitsummary.add(cui);
}
searchsummary.addAll(hitsummary);
}
assert(searchsummary.contains("C1265815"));
} }
public TopDocs performSearch(QueryParser qp, IndexSearcher is, String queryString, int n)
throws IOException, ParseException {
Query query = qp.parse(queryString);
return is.search(query, n);
}
public Document getDocument(int docId)
throws IOException {
return word2termSearcher.doc(docId);
}
} }
#log4j.rootLogger=DEBUG
#log4j.rootLogger=DEBUG, file
log4j.rootLogger=INFO, file
#log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.file=org.apache.log4j.RollingFileAppender
log4j.appender.file.File=log/umlsIndex.log
log4j.appender.file.MaxFileSize=100MB
log4j.appender.file.MaxBackupIndex=10
#log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.file.layout=org.apache.log4j.PatternLayout
log4j.appender.file.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment