Commit 81b2cb15 authored by John David Osborne
Browse files

Initial commit, fails with java.nio.file.NoSuchFileException:...

Initial commit, fails with java.nio.file.NoSuchFileException: /Users/ozborn/code/repo/umlsIndex/target/index.lucene/write.lock
parent e9f95d69
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>edu.uab.ccts.nlp</groupId>
    <artifactId>umlsIndex</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <packaging>jar</packaging>
    <name>umlsIndex</name>
    <url>http://maven.apache.org</url>
    <properties>
        <maven.compiler.target>1.8</maven.compiler.target>
        <maven.compiler.source>1.8</maven.compiler.source>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
    </properties>
    <dependencies>
        <!-- Test framework; AppTest uses the junit.framework (JUnit 3 style) API. -->
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.4</version>
            <scope>test</scope>
        </dependency>
        <!-- Oracle JDBC driver used to read the UMLS tables. -->
        <dependency>
            <groupId>com.oracle</groupId>
            <artifactId>ojdbc7</artifactId>
            <version>12.1.0.1</version>
        </dependency>
        <!-- SLF4J binding that routes logging to log4j 1.2. -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.21</version>
        </dependency>
        <!-- Provides StandardAnalyzer. -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-analyzers-common</artifactId>
            <version>6.0.0</version>
        </dependency>
        <!-- Core index classes (IndexWriter, FSDirectory, Document, ...). -->
        <dependency>
            <groupId>org.apache.lucene</groupId>
            <artifactId>lucene-core</artifactId>
            <version>6.0.0</version>
        </dependency>
    </dependencies>
</project>
package edu.uab.ccts.nlp.umlsIndex;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Paths;
import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.ResultSet;
import java.sql.Statement;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.FSDirectory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
 * Builds a Lucene index of UMLS concepts from rows fetched over JDBC.
 *
 * <p>Every row returned by the query stored at {@code sql/oracle/umlsConceptDefSelect.sql}
 * becomes one document with a tokenized {@code stemmedTerms} field and exact-match
 * {@code cui} and {@code sty} fields.
 */
public class UmlsIndexWriter {
    private static final Logger LOG = LoggerFactory.getLogger(UmlsIndexWriter.class);
    private static final String UMLS_CONCEPT_RETRIEVE_SQL_PATH = "sql/oracle/umlsConceptDefSelect.sql";
    /** Directory the index is written to, resolved against the working directory. */
    private static final String INDEX_PATH = "target/index.lucene";

    private final String jdbcConnectString;

    /**
     * Command-line entry point: builds the index from the database named by the first argument.
     *
     * @param args {@code args[0]} is the JDBC connect string of the UMLS database
     * @throws IllegalArgumentException if no connect string is supplied
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            throw new IllegalArgumentException("Usage: UmlsIndexWriter <jdbc-connect-string>");
        }
        UmlsIndexWriter uiw = new UmlsIndexWriter(args[0]);
        uiw.buildIndex();
    }

    /**
     * @param umlsDbString JDBC connect string handed to {@link DriverManager#getConnection(String)}
     */
    public UmlsIndexWriter(String umlsDbString) {
        this.jdbcConnectString = umlsDbString;
    }

    /**
     * Runs the concept query and writes one Lucene document per result row.
     *
     * <p>Failures are logged rather than thrown, so callers never see an exception from this
     * method. All JDBC and Lucene resources are closed on every path; closing the writer is
     * what commits the index.
     */
    public void buildIndex() {
        // NOTE(review): a JDBC URL can embed credentials - confirm it is acceptable to log it.
        LOG.info("jdbcString:\n{}", jdbcConnectString);
        try {
            String fetchSql = getTextFromFile(UMLS_CONCEPT_RETRIEVE_SQL_PATH);
            // Resources close in reverse order: the writer first, the connection last.
            try (Connection con = DriverManager.getConnection(jdbcConnectString);
                 Statement st = con.createStatement();
                 ResultSet rs = st.executeQuery(fetchSql);
                 StandardAnalyzer analyzer = new StandardAnalyzer();
                 FSDirectory index = FSDirectory.open(Paths.get(INDEX_PATH));
                 IndexWriter w = new IndexWriter(index, new IndexWriterConfig(analyzer))) {
                long indexed = 0;
                // rs.next() is advanced only here; advancing it again inside the body
                // would silently skip every second row.
                while (rs.next()) {
                    String cui = rs.getString(1);
                    String termText = rs.getString(2);
                    String semanticType = rs.getString(3);
                    addDoc(w, cui, termText, semanticType);
                    indexed++;
                }
                LOG.info("Indexed {} concepts into {}", indexed, INDEX_PATH);
            }
        } catch (Exception e) {
            // Boundary catch: keep the original contract of never throwing from buildIndex.
            LOG.error("Failed to build the UMLS index at {}", INDEX_PATH, e);
        }
    }

    /**
     * Reads a text resource fully, terminating every line with {@code '\n'}.
     *
     * @param path file-system path, or class-path resource name used when no readable
     *             file exists at that path
     * @return the content decoded as UTF-8
     * @throws IOException if the resource cannot be found or read
     */
    private String getTextFromFile(String path) throws IOException {
        StringBuilder sb = new StringBuilder();
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(openResource(path), StandardCharsets.UTF_8))) {
            String line;
            // readLine() returning null is the only reliable end-of-stream signal;
            // ready() merely reports whether a read would block.
            while ((line = br.readLine()) != null) {
                sb.append(line).append('\n');
            }
        }
        return sb.toString();
    }

    /**
     * Opens {@code path} from the file system when it is a readable file, otherwise from the
     * class path.
     *
     * @throws IOException if neither location provides the resource
     */
    private InputStream openResource(String path) throws IOException {
        File candidate = new File(path);
        if (candidate.isFile() && candidate.canRead()) {
            return new FileInputStream(candidate);
        }
        InputStream in = this.getClass().getClassLoader().getResourceAsStream(path);
        if (in == null) {
            throw new IOException("Resource not found on file system or class path: " + path);
        }
        return in;
    }

    /**
     * Adds one concept to the index.
     *
     * <p>A text field is a sequence of terms that has been tokenized, while a string
     * field is a single term (although it can also be multivalued).
     * Punctuation and spacing are ignored for text fields. Text tends to be
     * lowercased, stemmed, and even stripped of stop words. You tend to search text
     * using a handful of keywords whose exact order is not required, although
     * quoted phrases can be used as well. Fuzzy queries can be done on individual
     * terms (words), and wildcards as well.
     * String fields are literal character strings with all punctuation, spacing,
     * and case preserved. Anything other than an exact match is done using wildcards,
     * although fuzzy queries should presumably work as well.
     * String fields are useful for facets, filter queries or display.
     * Text fields are useful for keyword search.
     *
     * @param w             writer the document is added to
     * @param cui           concept identifier, stored as the string field {@code cui}
     * @param raw_text      concept text, stored as the text field {@code stemmedTerms}
     * @param semantic_type semantic type text, stored as the string field {@code sty}
     * @throws IOException if the writer fails to add the document
     */
    private void addDoc(IndexWriter w, String cui, String raw_text, String semantic_type) throws IOException {
        Document doc = new Document();
        String stemmedTerms = stemText(raw_text);
        doc.add(new TextField("stemmedTerms", stemmedTerms, Field.Store.YES));
        doc.add(new StringField("cui", cui, Field.Store.YES));
        doc.add(new StringField("sty", semantic_type, Field.Store.YES));
        w.addDocument(doc);
    }

    /**
     * Placeholder for stemming: currently returns the input unchanged.
     */
    private String stemText(String raw_text) {
        return raw_text;
    }
}
# Set root logger level to DEBUG and its only appender to A1.
log4j.rootLogger=DEBUG, A1
# A1 is set to be a ConsoleAppender.
log4j.appender.A1=org.apache.log4j.ConsoleAppender
# A1 uses PatternLayout.
log4j.appender.A1.layout=org.apache.log4j.PatternLayout
# Pattern fields, in order: elapsed milliseconds (%r, left-justified, min width 4),
# thread name (%t), level (%p, left-justified, min width 5), logger name (%c),
# nested diagnostic context (%x), message (%m), line separator (%n).
log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
SELECT
    concept_rows.thecui,                                  -- column 1: concept identifier
    minconstr || ' ' || maxconstr || ' ' || modeconstr,   -- column 2: min, max and most frequent concept string
    mintui || ' ' || maxtui || ' ' || modetui             -- column 3: min, max and most frequent tui
    -- || ' ' || modecondef
FROM (
    -- One aggregated row per cui over the English rows of mrconso.
    SELECT
        mrconso.cui AS thecui,
        -- LISTAGG(tui, ' ') WITHIN GROUP (ORDER BY tui) AS stypes,
        MIN(tui) AS mintui,
        MAX(tui) AS maxtui,
        STATS_MODE(tui) AS modetui,                                    -- 12 characters
        CAST(MIN(mrconso.str) AS VARCHAR(150)) AS minconstr,
        CAST(MAX(mrconso.str) AS VARCHAR(150)) AS maxconstr,
        CAST(STATS_MODE(mrconso.str) AS VARCHAR(150)) AS modeconstr,   -- 450 characters for strings
        CAST(STATS_MODE(mrdef.def) AS VARCHAR(3500)) AS modecondef
    FROM mrconso
        LEFT JOIN mrdef ON mrconso.cui = mrdef.cui
        JOIN mrsty ON mrconso.cui = mrsty.cui
    WHERE mrconso.lat = 'ENG'
        -- AND mrconso.ts = 'P'
        -- AND mrconso.stt = 'PF'
        -- AND mrconso.ispref = 'Y'
        -- AND tui IN ('T046','T047')
    GROUP BY mrconso.cui
) concept_rows
package edu.uab.ccts.nlp.umlsIndex;
import junit.framework.Test;
import junit.framework.TestCase;
import junit.framework.TestSuite;
/**
 * Placeholder JUnit 3 style test case for the umlsIndex module.
 */
public class AppTest extends TestCase {

    /**
     * Builds the test case.
     *
     * @param testName name passed on to {@link TestCase}
     */
    public AppTest(String testName) {
        super(testName);
    }

    /**
     * @return a suite built from {@code AppTest.class}
     */
    public static Test suite() {
        TestSuite allTests = new TestSuite(AppTest.class);
        return allTests;
    }

    /**
     * Trivial check that always passes.
     */
    public void testApp() {
        final boolean alwaysTrue = true;
        assertTrue(alwaysTrue);
    }
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment