Commit 507e1234 authored by John David Osborne's avatar John David Osborne
Browse files

Updates to use 2 different indexes, one for words in a concept name, one...

Updates to use 2 different indexes, one for words in a concept name, one mapping concept name to a CUI
parent 6d57a2b5
......@@ -36,6 +36,12 @@
<version>1.7.21</version>
</dependency>
<dependency>
<groupId>com.google.guava</groupId>
<artifactId>guava</artifactId>
<version>18.0</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
......
......@@ -3,4 +3,7 @@ package edu.uab.ccts.nlp.umlsIndex;
public class Config {
public static final String UMLS_CONCEPT_RETRIEVE_SQL_PATH = "sql/oracle/umlsConceptDefSelect.sql";
public static final String UMLS_LUCENE_INDEX_DIR = "target/index.lucene";
public static final String UMLS_WORD2TERM_INDEX_DIR = "target/word2term.lucene";
public static final String UMLS_TERM2CONCEPT_INDEX_DIR = "target/term2concept.lucene";
}
......@@ -6,17 +6,29 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringReader;
import java.net.URISyntaxException;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
......@@ -26,10 +38,11 @@ import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class UmlsIndexWriter {
private static final Logger LOG = LoggerFactory.getLogger(UmlsIndexWriter.class);
private String jdbcConnectString;
/**
* Write a simple UMLS Indexer
* @param args
......@@ -38,7 +51,7 @@ public class UmlsIndexWriter {
UmlsIndexWriter uiw = new UmlsIndexWriter(args[0]);
uiw.buildIndex();
}
public UmlsIndexWriter(String umlsDbString) { this.jdbcConnectString = umlsDbString; }
public void buildIndex(){
......@@ -47,18 +60,49 @@ public class UmlsIndexWriter {
try(Connection con = DriverManager.getConnection(jdbcConnectString)){
Statement st = con.createStatement();
ResultSet rs = st.executeQuery(fetchsql);
//StandardAnalyzer converts to lowercase and removes stop words
StandardAnalyzer analyzer = new StandardAnalyzer();
FSDirectory index = FSDirectory.open(Paths.get("target/index.lucene"));
IndexWriterConfig config = new IndexWriterConfig(analyzer);
IndexWriter w = new IndexWriter(index, config);
FSDirectory wordIndex = FSDirectory.open(Paths.get(Config.UMLS_WORD2TERM_INDEX_DIR));
FSDirectory termIndex = FSDirectory.open(Paths.get(Config.UMLS_TERM2CONCEPT_INDEX_DIR));
IndexWriter termW = new IndexWriter(wordIndex, config);
IndexWriter conW = new IndexWriter(termIndex, config);
//Foreach concept
while(rs.next()){
String cui = rs.getString(1);
String termText = rs.getString(2);
String semanticType = rs.getString(3);
addDoc(w, cui,termText,semanticType);
String[] termnames = rs.getString(2).split("__");
String[] stypes = rs.getString(3).split("__");
HashSet<String> stset = new HashSet<String>(Arrays.asList(stypes));
StringBuilder commaSTs=new StringBuilder();
for(String ststring : stset) {
commaSTs.append(ststring); commaSTs.append(",");
}
commaSTs.deleteCharAt(commaSTs.length()-1);
HashSet<String> termset = new HashSet<String>(Arrays.asList(termnames));
//Foreach term/synonym
for(String tnames : termset) {
System.out.println("Dealing with termname:"+tnames);
//Tokenize and underscore to reflect what Lucene does and
List<String> tokens = tokenizeString(analyzer, tnames);
StringBuilder sb = new StringBuilder();
for(int i=0;i<tokens.size();i++){
String tok = tokens.get(i);
System.out.println("Dealing with tokens:"+tok);
if(i<tokens.size()-1) { sb.append(tok); sb.append("_"); }
else sb.append(tok);
}
String officialLuceneTerm = sb.toString();
addConceptDoc(conW, cui,officialLuceneTerm,commaSTs.toString());
List<String> nostops = dropStopWords(tokens);
List<String> words = stemWords(nostops);
addTermDoc(termW, officialLuceneTerm,nostops,words);
}
rs.next();
}
w.close();
wordIndex.close();
termIndex.close();
rs.close();
} catch (Exception e) { e.printStackTrace(); }
}
......@@ -102,20 +146,82 @@ although I suppose fuzzy query should work as well.
String fields are useful for facets and filter queries or display.
Text fields are useful for keyword search.
*
* @param w
* @param title
* @param isbn
* @param conceptUnderscoredText
* @param nostops
* @param tokenized
* @throws IOException
*/
private void addDoc(IndexWriter w, String cui, String raw_text, String semantic_type) throws IOException {
private void addTermDoc(IndexWriter w, String conceptUnderscoredText, List<String> nostops, List<String>tokenized) throws IOException {
Document doc = new Document();
String stemmedTerms = stemText(raw_text);
doc.add(new TextField("stemmedTerms", stemmedTerms, Field.Store.YES));
doc.add(new StringField("cui", cui, Field.Store.YES));
doc.add(new StringField("sty", semantic_type, Field.Store.YES));
for(String tword : nostops) {
doc.add(new TextField("word", tword, Field.Store.YES));
StoredField strField = new StoredField("conceptText", conceptUnderscoredText);
doc.add(strField);
w.addDocument(doc);
}
}
private void addConceptDoc(IndexWriter w, String cui, String conceptUnderscoredText, String semantic_type) throws IOException {
Document doc = new Document();
doc.add(new TextField("concept", conceptUnderscoredText, Field.Store.YES));
StoredField strField = new StoredField("cui", cui);
doc.add(strField);
//doc.add(new StringField("sty", semantic_type, Field.Store.YES));
StoredField styField = new StoredField("sty", semantic_type);
doc.add(styField);
System.out.println("Adding to concepts:"+conceptUnderscoredText+" with cui:"+cui+" with types:"+semantic_type);
w.addDocument(doc);
}
private String stemText(String raw_text) { return raw_text; }
public static List<String> tokenizeString(Analyzer analyzer, String string) {
List<String> result = new ArrayList<String>();
try (TokenStream stream = analyzer.tokenStream(null, new StringReader(string))){
stream.reset();
while (stream.incrementToken()) {
result.add(stream.getAttribute(CharTermAttribute.class).toString());
}
stream.end();
} catch (IOException e) {
// not thrown b/c we're using a string reader...
throw new RuntimeException(e);
}
return result;
}
/**
* May not be needed depending on which stopwords Lucenes uses (FIXME)
* @param allWords
* @return
* @throws URISyntaxException
*/
public List<String> dropStopWords(List<String> allWords) throws URISyntaxException{
List<String> stops = new ArrayList<String>();
try (Stream<String> stream = Files.lines(Paths.get(getClass().getResource("/StopWords.txt").toURI()))) {
stops = stream
.filter(line -> !line.startsWith("#"))
.collect(Collectors.toList());
} catch (IOException e) {
e.printStackTrace();
}
allWords.removeAll(stops);
return allWords;
}
/**
* FIXME - Stem words, use either CTAKES or Noble Porter Stemmer?
* @param allWords
* @return
*/
private List<String> stemWords(List<String> allWords){
return new ArrayList<String>(allWords);
}
}
SELECT
thetable.thecui,
minconstr || ' ' || maxconstr || ' ' || modeconstr,
mintui||' '||maxtui||' '||modetui
minconstr || '__' || maxconstr || '__' || modeconstr,
mintui||'__'||maxtui||'__'||modetui
-- || ' ' || modecondef
FROM
(
......@@ -20,5 +20,6 @@ WHERE mrconso.LAT='ENG'
-- AND mrconso.stt = 'PF'
-- AND mrconso.ispref = 'Y'
-- AND tui IN ('T046','T047')
-- AND mrconso.cui='C0814136'
GROUP BY mrconso.cui
) thetable
......@@ -42,13 +42,13 @@ public class LuceneIndexIT
@org.junit.Test
public void testIndex() throws Exception
{
TopDocs td = performSearch("cancer", 2);
TopDocs td = performSearch("cancer", 8);
ScoreDoc[] hits = td.scoreDocs;
System.out.println("Number of hits: " + hits.length);
for (int i = 0; i < hits.length; i++) {
Document hitDoc = searcher.doc(hits[i].doc);
System.out.println(hitDoc.get("stemmedTerms"));
System.out.println(hitDoc.get("cui"));
System.out.println(hitDoc.get("cui")+" with score:"+hits[i].score);
System.out.println(hitDoc.get("sty"));
//assertEquals("C0814136", hitDoc.get("cui"));
}
......
package edu.uab.ccts.nlp.umlsIndex.test.unit;
import static org.junit.Assert.*;
import java.net.URISyntaxException;
import java.util.ArrayList;
import java.util.List;
import org.junit.Test;
import edu.uab.ccts.nlp.umlsIndex.UmlsIndexWriter;
public class TestUmlsIndexWriter {
@Test
public void testDropStopWords() throws URISyntaxException{
List<String> testlist = new ArrayList<String>();
testlist.add("and");
testlist.add("cancer");
assertEquals(2,testlist.size());
UmlsIndexWriter uiw = new UmlsIndexWriter("fake");
List<String> fixed = uiw.dropStopWords(testlist);
assertEquals(1,fixed.size());
}
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment