// GenerateLuceneIndexEnglish.groovy
// From https://github.com/flora-phenotype-ontology/flopoontology (FLOPO BioHackathon 2019 release)
import org.apache.lucene.analysis.*
import org.apache.lucene.analysis.standard.*
import org.apache.lucene.document.*
import org.apache.lucene.index.*
import org.apache.lucene.store.*
import org.apache.lucene.util.*
import org.apache.lucene.search.*
import org.apache.lucene.queryparser.classic.*
import opennlp.tools.sentdetect.*


String indexPath = "lucene-index-english"          // per-sentence flora text
String ontologyIndexPath = "lucene-index-ontology" // ontology labels and synonyms

Directory dir = FSDirectory.open(new File(indexPath))
Directory ontologyIndexDir = FSDirectory.open(new File(ontologyIndexPath))
Analyzer analyzer = new StandardAnalyzer(Version.LUCENE_47)

IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_47, analyzer)
iwc.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
iwc.setRAMBufferSizeMB(32768.0) // 32 GB in-memory buffer before flushing; lower this on smaller machines
IndexWriter writer = new IndexWriter(dir, iwc)

// note: despite its name, englishWriter writes the *ontology* index; each
// IndexWriter needs its own IndexWriterConfig, hence the second config
IndexWriterConfig iwcOntology = new IndexWriterConfig(Version.LUCENE_47, analyzer)
iwcOntology.setOpenMode(IndexWriterConfig.OpenMode.CREATE)
iwcOntology.setRAMBufferSizeMB(32768.0)
IndexWriter englishWriter = new IndexWriter(ontologyIndexDir, iwcOntology)

// field type for the indexed flora text: stored, tokenized, and with full term
// vectors (positions, offsets, payloads) so hits can be mapped back to text spans
FieldType fieldType = new FieldType()
fieldType.setStoreTermVectors(true)
fieldType.setStoreTermVectorPositions(true)
fieldType.setStoreTermVectorOffsets(true)
fieldType.setStoreTermVectorPayloads(true)
fieldType.setIndexed(true)
fieldType.setTokenized(true)
fieldType.setStored(true)
fieldType.setIndexOptions(FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS)
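
/*
 * Illustrative sketch (not executed; doc id 0 is a placeholder): the term
 * vectors enabled above can be read back per document after indexing, e.g. to
 * align ontology-term matches with token positions.
 *
 *   IndexReader r = DirectoryReader.open(dir)
 *   Terms tv = r.getTermVector(0, "description")
 *   TermsEnum te = tv.iterator(null) // Lucene 4.x signature: iterator(TermsEnum reuse)
 *   BytesRef term
 *   while ((term = te.next()) != null) {
 *     println term.utf8ToString() + "\t" + te.totalTermFreq()
 *   }
 *   r.close()
 */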

def author = null
def year = null
List<String> rankOrder = ["order", "family", "subfamily", "tribe", "subtribe", "genus", "subgenus", "species", "subspecies", "variety"]
Map<String, Set<String>> previousCharacters = [:] // maps a taxonomic rank name to EQs
Map<String, String> previousNames = [:] // keeps the previously encountered taxon names; rank name -> value

/* load the OpenNLP sentence-splitting model once; it is reused for all floras below */
SentenceModel sentenceModel = new SentenceModel(new FileInputStream("models/en-sent.bin"))
SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentenceModel)
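
/*
 * Example (values invented): after the Flora Malesiana loop below has seen
 * family "Rosaceae" and then genus "Rubus", previousNames is
 * [family: "Rosaceae", genus: "Rubus"], and an accepted name at genus level
 * yields taxonString == "family: Rosaceae; genus: Rubus; <author>; <year>".
 */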



XmlSlurper slurper = new XmlSlurper(false, false) // non-validating, not namespace-aware
slurper.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false) // avoid fetching external DTDs

/* Kew African */
def kew = slurper.parse(new File("floras-other/Kew African Flora Species.xml"))
kew.Specieslist.each { species ->
  def family = species."Name.family"[0].text()
  def genus = species."Name.genus"[0].text()
  def sname = species."Name.species"[0].text()
  def subspecies = species."infraepi"[0].text()
  def taxonString = "Family: $family, Genus: $genus, Species: $sname, Sub: $subspecies"
  def description = species."description"[0].text()
  def habitat = species."habitat"[0].text()

  /* split the description into sentences (and further at semicolons below) */
  def sentences = sentenceDetector.sentDetect(description)
  sentences.each { sentence1 ->
    sentence1.split(";").each { sentence ->
      Document doc = new Document()
      doc.add(new Field("taxon", taxonString, Field.Store.YES, Field.Index.NO))
      doc.add(new Field("description", sentence, fieldType))
      writer.addDocument(doc)
    }
  }
  sentences = sentenceDetector.sentDetect(habitat)
  sentences.each { sentence1 ->
    sentence1.split(";").each { sentence ->
      Document doc = new Document()
      doc.add(new Field("taxon", taxonString, Field.Store.YES, Field.Index.NO))
      doc.add(new Field("habitat", sentence, fieldType))
      writer.addDocument(doc)
    }
  }
}
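
/*
 * For reference, the GPath expressions above assume Kew records shaped roughly
 * like the following (element names taken from the code; the values are invented):
 *
 *   <Specieslist>
 *     <Name.family>Rosaceae</Name.family>
 *     <Name.genus>Rubus</Name.genus>
 *     <Name.species>apetalus</Name.species>
 *     <infraepi></infraepi>
 *     <description>Shrub to 3 m; leaves ternate; flowers white.</description>
 *     <habitat>Montane forest margins; 1500-2500 m.</habitat>
 *   </Specieslist>
 */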



/* Flora Malesiana */
new File("flora-malesiana").eachFile { florafile ->
  def flora = slurper.parse(florafile)
  flora.treatment.each { treatment ->
    treatment.taxon.each { taxon ->

      // first try to get the taxon name
      def taxonString = ""
      def name = null
      taxon.nomenclature.homotypes.nom.each { nom ->
        if (nom.@class.text() == "accepted") {
          name = nom
          def lastOrderRank = ""
          /* determine the level of the taxonomic tree we are currently in,
             reusing information from the higher ranks encountered before */
          nom.name.each { nomname ->
            def cname = nomname.@class.text()
            if (cname == "author") { author = nomname.text() }
            if (cname == "year") { year = nomname.text() }
            if (cname in rankOrder) {
              lastOrderRank = cname
              def cvalue = nomname.text()
              /* disabled: clearing previousCharacters/previousNames from the
                 end of rankOrder down to the current rank
              rankOrder[-1..rankOrder.indexOf(cname)].each {
                previousCharacters[it] = null
                previousNames[it] = null
              }*/
              previousNames[cname] = cvalue
            }
          }
          rankOrder[0..rankOrder.indexOf(lastOrderRank)].each {
            if (previousNames[it] != null) {
              taxonString += "$it: " + previousNames[it] + "; "
            }
          }
          taxonString += "$author; $year"
        }
      }

      // now index the descriptions: first assemble the description text, then
      // sentencize, then create a new document for each sentence
      String description = ""
      taxon.feature.each { feature ->
        if (feature.@class.text() == "description") {
          feature.char.each { character ->
            description += character.text() + " "
          }
        }
      }
      String habitat = ""
      taxon.feature.each { feature ->
        if (feature.@class.text() == "habitatecology") {
          feature.char.each { character ->
            habitat += character.text() + " "
          }
        }
      }
      /* split the description into sentences, then at ";;;" boundaries */
      def sentences = sentenceDetector.sentDetect(description)
      sentences.each { sentence1 ->
        sentence1.split(";;;").each { sentence ->
          Document doc = new Document()
          doc.add(new Field("taxon", taxonString, Field.Store.YES, Field.Index.NO))
          doc.add(new Field("description", sentence, TextField.TYPE_STORED))
          writer.addDocument(doc)
        }
      }
      /* same for the habitat text */
      sentences = sentenceDetector.sentDetect(habitat)
      sentences.each { sentence1 ->
        sentence1.split(";;;").each { sentence ->
          Document doc = new Document()
          doc.add(new Field("taxon", taxonString, Field.Store.YES, Field.Index.NO))
          doc.add(new Field("habitat", sentence, TextField.TYPE_STORED))
          writer.addDocument(doc)
        }
      }
    }
  }
}
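
/*
 * For reference, the nomenclature/feature traversal above assumes treatments
 * shaped roughly like the following (names from the code; values invented):
 *
 *   <treatment>
 *     <taxon>
 *       <nomenclature><homotypes>
 *         <nom class="accepted">
 *           <name class="genus">Rubus</name>
 *           <name class="species">moluccanus</name>
 *           <name class="author">L.</name>
 *           <name class="year">1753</name>
 *         </nom>
 *       </homotypes></nomenclature>
 *       <feature class="description"><char class="leaves">Leaves palmately lobed.</char></feature>
 *       <feature class="habitatecology"><char>Forest edges up to 1500 m.</char></feature>
 *     </taxon>
 *   </treatment>
 */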

/* Final part: add all ontology terms to their own index so that we can search
   for them more easily. Currently disabled; unless this block is re-enabled,
   englishWriter produces an empty ontology index. */
/*
def ontologyDirectory = "ont/"
new File("ont").eachFile { ontfile ->
  def id = ""
  ontfile.eachLine { line ->
    if (line.startsWith("id:")) {
      id = line.substring(3).trim()
    }
    if (line.startsWith("name:")) {
      def name = line.substring(5).trim()
      Document doc = new Document()
      doc.add(new Field("id", id, Field.Store.YES, Field.Index.NO))
      doc.add(new Field("label", name, TextField.TYPE_STORED))
      englishWriter.addDocument(doc)
    }
    if (line.startsWith("synonym:")) {
      def syn = line.substring(line.indexOf("\"")+1, line.lastIndexOf("\"")).trim()
      Document doc = new Document()
      doc.add(new Field("id", id, Field.Store.YES, Field.Index.NO))
      doc.add(new Field("label", syn, TextField.TYPE_STORED))
      englishWriter.addDocument(doc)
    }
  }
}
*/
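
/*
 * The disabled parser above expects OBO flat-file term stanzas with lines such
 * as the following (the id and labels are made-up placeholders):
 *
 *   id: FLOPO:0000123
 *   name: leaf margin serrate
 *   synonym: "serrated leaf edge" EXACT []
 */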
writer.close()
englishWriter.close()
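
/*
 * Sketch of querying the finished index (illustrative; the query string is a
 * placeholder, and the ontology index is only populated if the block above is
 * re-enabled):
 *
 *   IndexReader reader = DirectoryReader.open(FSDirectory.open(new File(indexPath)))
 *   IndexSearcher searcher = new IndexSearcher(reader)
 *   QueryParser parser = new QueryParser(Version.LUCENE_47, "description", analyzer)
 *   TopDocs hits = searcher.search(parser.parse("leaves AND glabrous"), 10)
 *   hits.scoreDocs.each { sd ->
 *     println searcher.doc(sd.doc).get("taxon") + "\t" + searcher.doc(sd.doc).get("description")
 *   }
 *   reader.close()
 */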