John David Osborne / umlsIndex / Commits

Commit b9396ae6
Authored Jun 05, 2016 by John David Osborne

    Some memory issues still

Parent: df2fadd2
Changes: 4 files
src/main/java/edu/uab/ccts/nlp/umlsIndex/UmlsIndexWriter.java
...

@@ -17,8 +17,12 @@ import java.sql.ResultSet;
import java.sql.Statement;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Collectors;
import java.util.stream.Stream;
...

@@ -42,6 +46,8 @@ import org.slf4j.LoggerFactory;
public class UmlsIndexWriter {
    private static final Logger LOG = LoggerFactory.getLogger(UmlsIndexWriter.class);
    private String jdbcConnectString;
    Map<String, Set<String>> word2term = new HashMap<String, Set<String>>(500000);
    /**
     * Write a simple UMLS Indexer
...

@@ -83,13 +89,13 @@ public class UmlsIndexWriter {
        HashSet<String> termset = new HashSet<String>(Arrays.asList(termnames));
        //Foreach term/synonym
        for (String tnames : termset) {
-           System.out.println("Dealing with term name:" + tnames);
+           LOG.debug("Dealing with concept name:" + tnames);
            //Tokenize and underscore to reflect what Lucene does and
            List<String> tokens = tokenizeString(analyzer, tnames);
            StringBuilder sb = new StringBuilder();
            for (int i = 0; i < tokens.size(); i++) {
                String tok = tokens.get(i);
-               System.out.println("Dealing with tokens:" + tok);
+               LOG.debug("Dealing with tokens:" + tok);
                if (i < tokens.size() - 1) {
                    sb.append(tok);
                    sb.append("_");
                }
                else sb.append(tok);
            }
...

@@ -97,11 +103,28 @@ public class UmlsIndexWriter {
                addConceptDoc(conW, cui, officialLuceneTerm, commaSTs.toString());
                List<String> nostops = dropStopWords(tokens);
                List<String> words = stemWords(nostops);
                addTermDoc(termW, officialLuceneTerm, nostops, words);
                //addTermDoc(termW, officialLuceneTerm,nostops);
                addWord2Term(officialLuceneTerm, words);
            }
            rs.next();
        }
        conW.close();
        for (Iterator<String> it = word2term.keySet().iterator(); it.hasNext();) {
            String word = it.next();
            Document doc = new Document();
            doc.add(new TextField("word", word, Field.Store.YES));
            Set<String> conceptTexts = word2term.get(word);
            for (String ctext : conceptTexts) {
                StoredField strField = new StoredField("conceptText", ctext);
                doc.add(strField);
            }
            LOG.debug("Adding document for " + word + " with " + conceptTexts.size() + " entries");
            termW.addDocument(doc);
            it.remove();
        }
        termW.close();
        wordIndex.close();
        termIndex.close();
        rs.close();
...

@@ -154,12 +177,13 @@ Text fields are useful for keyword search.
     * @param tokenized
     * @throws IOException
     */
-   private void addTermDoc(IndexWriter w, String conceptUnderscoredText, List<String> nostops, List<String> tokenized) throws IOException {
+   private void addTermDoc(IndexWriter w, String conceptUnderscoredText, List<String> nostops) throws IOException {
        Document doc = new Document();
        for (String tword : nostops) {
            doc.add(new TextField("word", tword, Field.Store.YES));
            StoredField strField = new StoredField("conceptText", conceptUnderscoredText);
            doc.add(strField);
            LOG.info("Adding to words:" + tword + " with concept name:" + conceptUnderscoredText);
            w.addDocument(doc);
        }
    }
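For orientation, here is a minimal, self-contained sketch (not code from this repository; the class name and the in-memory RAMDirectory are illustrative) of the document shape addTermDoc writes: a searchable, stored "word" TextField plus a stored-only "conceptText" payload, added through a Lucene IndexWriter.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.RAMDirectory;

public class TermDocSketch {
    public static void main(String[] args) throws Exception {
        // In-memory directory purely for illustration; an on-disk FSDirectory would be used in practice.
        try (IndexWriter w = new IndexWriter(new RAMDirectory(), new IndexWriterConfig(new StandardAnalyzer()))) {
            Document doc = new Document();
            doc.add(new TextField("word", "ulcer", Field.Store.YES));            // indexed and stored
            doc.add(new StoredField("conceptText", "mucocutaneous_ulceration")); // stored only, not searchable
            w.addDocument(doc);
        }
    }
}

Keeping the concept text in a StoredField makes it retrievable from hits without adding it to the searchable index.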
...

@@ -173,7 +197,7 @@ Text fields are useful for keyword search.
        //doc.add(new StringField("sty", semantic_type, Field.Store.YES));
        StoredField styField = new StoredField("sty", semantic_type);
        doc.add(styField);
-       System.out.println("Adding to concepts:" + conceptUnderscoredText + " with cui:" + cui + " with types:" + semantic_type);
+       LOG.info("Adding to concepts:" + conceptUnderscoredText + " with cui:" + cui + " with types:" + semantic_type);
        w.addDocument(doc);
    }
...

@@ -223,6 +247,18 @@ Text fields are useful for keyword search.
        return new ArrayList<String>(allWords);
    }

    private void addWord2Term(String concept, List<String> tokens) {
        for (String tok : tokens) {
            Set<String> s = word2term.get(tok);
            if (s == null) s = new HashSet<String>();
            s.add(concept);
            word2term.put(tok, s);
            LOG.info("Added/Updated " + tok + " with " + s.size() + " concepts");
        }
    }
}
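The word2term map declared above can accumulate several hundred thousand word-to-concept entries before any of them are written, which is a plausible source of the memory pressure the commit message refers to. Below is a minimal, pure-JDK sketch (hypothetical class and method names, not code from this repository) of the accumulate-then-drain pattern used here: entries are removed through the iterator as soon as their document is flushed, so the map shrinks during the write instead of being held whole until the writer closes.

import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

public class DrainSketch {
    public static void main(String[] args) {
        Map<String, Set<String>> word2term = new HashMap<String, Set<String>>();

        // Accumulate: same shape as addWord2Term, one concept text per word token.
        add(word2term, "ulcer", "mucocutaneous_ulceration");
        add(word2term, "ulcer", "gastric_ulcer");
        add(word2term, "gastric", "gastric_ulcer");

        // Drain: flush each entry, then remove it through the iterator so it can be collected.
        for (Iterator<Map.Entry<String, Set<String>>> it = word2term.entrySet().iterator(); it.hasNext();) {
            Map.Entry<String, Set<String>> e = it.next();
            flush(e.getKey(), e.getValue()); // stand-in for building the Lucene document and addDocument()
            it.remove();
        }
    }

    private static void add(Map<String, Set<String>> map, String word, String concept) {
        Set<String> s = map.get(word);
        if (s == null) s = new HashSet<String>();
        s.add(concept);
        map.put(word, s);
    }

    private static void flush(String word, Set<String> concepts) {
        System.out.println(word + " -> " + concepts);
    }
}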
src/main/resources/log4j.properties
# Set root logger level to DEBUG and its only appender to A1.
-log4j.rootLogger=DEBUG, A1
+log4j.rootLogger=INFO, A1
# A1 is set to be a ConsoleAppender.
log4j.appender.A1=org.apache.log4j.ConsoleAppender
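Not part of this change: with the root logger dropped from DEBUG to INFO, log4j 1.x can still emit DEBUG output for this project's own classes through a package-level logger, for example:

# Hypothetical addition (not in this commit): keep the root at INFO, DEBUG only for the indexer package
log4j.logger.edu.uab.ccts.nlp.umlsIndex=DEBUG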
...
src/main/resources/sql/oracle/umlsConceptDefSelect.sql
...
@@ -19,6 +19,7 @@ WHERE mrconso.LAT='ENG'
-- AND mrconso.ts = 'P'
-- AND mrconso.stt = 'PF'
-- AND mrconso.ispref = 'Y'
-- AND tui IN ('T046')
-- AND tui IN ('T046','T047')
-- AND mrconso.cui='C0814136'
GROUP BY mrconso.cui
...
src/test/java/edu/uab/ccts/nlp/umlsIndex/test/integration/LuceneIndexIT.java
...
@@ -4,7 +4,6 @@ import static org.junit.Assert.*;
import java.io.IOException;
import java.nio.file.Paths;
-import org.junit.Test;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;

...
@@ -23,8 +22,10 @@ import org.apache.lucene.store.FSDirectory;
 */
public class LuceneIndexIT
{
-   private IndexSearcher searcher = null;
-   private QueryParser parser = null;
+   private IndexSearcher word2termSearcher = null;
+   private IndexSearcher term2conceptSearcher = null;
+   private QueryParser wordParser = null;
+   private QueryParser termParser = null;

    /**
     * Create the test case
     *

...
@@ -32,8 +33,10 @@ public class LuceneIndexIT
     */
    public LuceneIndexIT() throws IOException {
-       searcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(Paths.get("target/index.lucene"))));
-       parser = new QueryParser("stemmedTerms", new StandardAnalyzer());
+       word2termSearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(Paths.get("target/word2concept.lucene"))));
+       term2conceptSearcher = new IndexSearcher(DirectoryReader.open(FSDirectory.open(Paths.get("target/term2concept.lucene"))));
+       wordParser = new QueryParser("word", new StandardAnalyzer());
+       termParser = new QueryParser("concept", new StandardAnalyzer());
    }

...
@@ -42,11 +45,11 @@ public class LuceneIndexIT
    @org.junit.Test
    public void testIndex() throws Exception {
-       TopDocs td = performSearch("cancer", 8);
+       TopDocs td = performSearch(wordParser, "mucocutaneous ulcers", 20);
        ScoreDoc[] hits = td.scoreDocs;
        System.out.println("Number of hits: " + hits.length);
        for (int i = 0; i < hits.length; i++) {
-           Document hitDoc = searcher.doc(hits[i].doc);
+           Document hitDoc = word2termSearcher.doc(hits[i].doc);
            System.out.println(hitDoc.get("stemmedTerms"));
            System.out.println(hitDoc.get("cui") + " with score:" + hits[i].score);
            System.out.println(hitDoc.get("sty"));

...
@@ -54,14 +57,14 @@ public class LuceneIndexIT
        }
    }

-   public TopDocs performSearch(String queryString, int n)
+   public TopDocs performSearch(QueryParser qp, String queryString, int n)
            throws IOException, ParseException {
-       Query query = parser.parse(queryString);
-       return searcher.search(query, n);
+       Query query = qp.parse(queryString);
+       return word2termSearcher.search(query, n);
    }

    public Document getDocument(int docId) throws IOException {
-       return searcher.doc(docId);
+       return word2termSearcher.doc(docId);
    }
}
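For completeness, a self-contained sketch (not part of the repository; the class name is illustrative, and which stored fields are actually present depends on which writer produced the index) of the query path the test now exercises: parse free text against the "word" field, search the word2concept index, and read stored values back from the hits.

import java.nio.file.Paths;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

public class WordIndexQuerySketch {
    public static void main(String[] args) throws Exception {
        try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("target/word2concept.lucene")))) {
            IndexSearcher searcher = new IndexSearcher(reader);
            Query q = new QueryParser("word", new StandardAnalyzer()).parse("mucocutaneous ulcers");
            TopDocs hits = searcher.search(q, 20);
            for (ScoreDoc sd : hits.scoreDocs) {
                // "conceptText" is stored by UmlsIndexWriter alongside each word document
                System.out.println(searcher.doc(sd.doc).get("conceptText") + "  score=" + sd.score);
            }
        }
    }
}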