https://github.com/flora-phenotype-ontology/flopoontology
Tip revision: cfb699302816405d6a7775ff377aaf5f05b510aa authored by Robert Hoehndorf on 06 September 2019, 01:20:44 UTC
FLOPO BioHackathon 2019 release
FLOPO BioHackathon 2019 release
Tip revision: cfb6993
GenerateLuceneIndexEnglishSolana.groovy
@Grab( 'com.xlson.groovycsv:groovycsv:1.0' )
import static com.xlson.groovycsv.CsvParser.parseCsv
import org.apache.lucene.analysis.*
import org.apache.lucene.analysis.standard.*
import org.apache.lucene.document.*
import org.apache.lucene.index.*
import org.apache.lucene.store.*
import org.apache.lucene.util.*
import org.apache.lucene.search.*
import org.apache.lucene.queryparser.classic.*
import com.aliasi.medline.*
import org.apache.lucene.analysis.fr.*
import opennlp.tools.sentdetect.*
import opennlp.tools.dictionary.*
import opennlp.tools.tokenize.*
import opennlp.tools.util.*
import opennlp.tools.chunker.*
import opennlp.tools.postag.*
import opennlp.tools.namefind.*
import java.util.concurrent.*
// ---------------------------------------------------------------------------
// Script setup: open the two on-disk Lucene indexes and load the input CSV.
// ---------------------------------------------------------------------------

// Index that receives one document per description fragment.
String indexPath = "lucene-index-solana"
// Index intended for ontology labels and synonyms.
String ontologyIndexPath = "lucene-index-ontology"
Directory dir = FSDirectory.open(new File(indexPath))
Directory ontologyIndexDir = FSDirectory.open(new File(ontologyIndexPath))

// A single StandardAnalyzer instance is shared by both writer configurations.
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47)

// OpenMode.CREATE: an index already present at indexPath is replaced, not appended to.
IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, analyzer)
iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
// NOTE(review): the value is in megabytes, i.e. a 32 GB flush threshold -- confirm this is intended.
iwc.setRAMBufferSizeMB(32768.0)
IndexWriter writer = new IndexWriter(dir, iwc)

// NOTE(review): the only code that adds documents through englishWriter is the
// commented-out ontology section further down, yet the writer is still opened
// with OpenMode.CREATE. Confirm that replacing whatever index exists at
// ontologyIndexPath on every run is intended.
IndexWriterConfig iwcEnglish = new IndexWriterConfig(Version.LUCENE_47, analyzer)
iwcEnglish.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
iwcEnglish.setRAMBufferSizeMB(32768.0)
IndexWriter englishWriter = new IndexWriter(ontologyIndexDir, iwcEnglish)

// The whole CSV is read into memory (no explicit charset is given) and handed
// to groovycsv; `data` is then iterated row by row.
def stext = new File("solana/descriptions.csv").getText()
def data = parseCsv(stext)
// Load the OpenNLP English sentence model once, up front. The model does not
// depend on the CSV row, so it is built before the loop and reused for every
// row; withInputStream closes the model file as soon as it has been read.
SentenceDetectorME sentenceDetector = new File("models/en-sent.bin").withInputStream { modelStream ->
    new SentenceDetectorME(new SentenceModel(modelStream))
}

// For every CSV row: split the "Description" column into sentences, split each
// sentence further on ';', and add one Lucene document per resulting fragment.
for (line in data) {
    def description = line.Description
    def taxonString = line."Taxon Name"

    sentenceDetector.sentDetect(description).each { sentence1 ->
        sentence1.split(";").each { sentence ->
            Document doc = new Document()
            // The taxon name is stored for retrieval only; it is not indexed.
            doc.add(new StoredField("taxon", taxonString))
            // The fragment text is analyzed, indexed and stored.
            doc.add(new Field("description", sentence, TextField.TYPE_STORED))
            writer.addDocument(doc)
        }
    }
    println "Added $taxonString"
}
/*
SentenceModel sentenceModel = new SentenceModel(new FileInputStream("models/en-sent.bin"))
SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentenceModel)
def sentences = sentenceDetector.sentDetect(description)
sentences = sentenceDetector.sentDetect(habitat)
sentences.each { sentence1 ->
sentence1.split(";").each { sentence ->
Document doc = new Document()
doc.add(new Field("taxon", taxonString, Field.Store.YES, Field.Index.NO))
doc.add(new Field("habitat", sentence, TextField.TYPE_STORED))
writer.addDocument(doc)
}
}
}
*/
/* Final part: we also add all the ontology terms to the index so that we can search for them more easily */
/*
def ontologyDirectory = "ont/"
new File("ont").eachFile { ontfile ->
def id = ""
ontfile.eachLine { line ->
if (line.startsWith("id:")) {
id = line.substring(3).trim()
}
if (line.startsWith("name:")) {
def name = line.substring(5).trim()
Document doc = new Document()
doc.add(new Field("id", id, Field.Store.YES, Field.Index.NO))
doc.add(new Field("label", name, TextField.TYPE_STORED))
englishWriter.addDocument(doc)
}
if (line.startsWith("synonym:")) {
def syn = line.substring(line.indexOf("\"")+1, line.lastIndexOf("\"")).trim()
Document doc = new Document()
doc.add(new Field("id", id, Field.Store.YES, Field.Index.NO))
doc.add(new Field("label", syn, TextField.TYPE_STORED))
englishWriter.addDocument(doc)
}
}
}
*/
// Close both index writers. The try/finally ensures the ontology writer is
// still closed when closing the description writer throws.
try {
    writer.close()
} finally {
    englishWriter.close()
}