diff --git a/plans/entities.plan b/plans/entities.plan index f98ea1c1a7e009f4d9006ca56f147057fbb3ad01..8523ddb14c67d45b2edc40e07b71539e55ef52cd 100644 --- a/plans/entities.plan +++ b/plans/entities.plan @@ -17,6 +17,10 @@ <param name="input-xslt"> <alias module="read.pubmed" param="xslTransform"/> </param> + + <param name="batch"> + <alias module="read.pubmed" param="constantDocumentFeatures"/> + </param> <param name="outputDir"> <alias module="output.doc-mesh" param="outDir"/> @@ -139,8 +143,7 @@ <param name="NCBI_taxa_ontobiotope"> <alias module="habitats.tomap-habitats.map-living-organisms" param="mappingFile"/> </param> - - + <!-- habitats--> <param name="ontobiotope-habitat"> <alias module="habitats.tomap-habitats.concept-names" param="oboFiles"/> @@ -195,14 +198,15 @@ <read> <pubmed class="XMLReader"> - <sourcePath>corpora/&corpus;/batches/&batch;/batch.xml</sourcePath> - <xslTransform>ancillaries/&corpus;-pubmed2alvisnlp.xslt</xslTransform> + <sourcePath>corpora/pubmed/batches/0001/batch.xml</sourcePath> + <xslTransform>ancillaries/microbes-2019-pubmed2alvisnlp.xslt</xslTransform> + <constantDocumentFeatures>batch=0001</constantDocumentFeatures> </pubmed> <bionlp-st class="BioNLPSTReader"> <active>true</active> <sectionName>abstract</sectionName> - <textDir>corpora/&corpus;/batches/&batch;/bionlp-st</textDir> + <textDir>corpora/BioNLP-OST-2019/batches/BB19-kb+ner/bionlp-st</textDir> </bionlp-st> </read> @@ -404,7 +408,7 @@ <!-- Run Yatea term extractor --> <yatea class="YateaExtractor"> <sectionFilter>@name == "title" or @name == "abstract"</sectionFilter> - <xmlTermsFile>corpora/&corpus;/batches/&batch;/yatea/candidates.xml</xmlTermsFile> + <xmlTermsFile>yatea/candidates.xml</xmlTermsFile> <posFeature>tt_pos</posFeature> <configDir>ancillaries/YaTeA/config-habitats</configDir> <localeDir>ancillaries/YaTeA/locale</localeDir> @@ -415,7 +419,7 @@ <!-- Run Yatea term extractor on variants --> <yatea-var class="YateaExtractor"> <sectionFilter>@name == "title" or 
@name == "abstract"</sectionFilter> - <xmlTermsFile>corpora/&corpus;/batches/&batch;/yatea-var/candidates.xml</xmlTermsFile> + <xmlTermsFile>yatea-var/candidates.xml</xmlTermsFile> <posFeature>tt_pos</posFeature> <lemmaFeature>variant</lemmaFeature> <configDir>ancillaries/YaTeA/config-habitats</configDir> @@ -669,12 +673,12 @@ <output> <doc-mesh class="TabularExport"> - <outDir>corpora/&corpus;/batches/&batch;</outDir> + <outDir>.</outDir> <files>$</files> <fileName>"doc-mesh.txt"</fileName> <lines>documents.sections:mesh</lines> <columns separator=";"> - "&batch;"; + document.@batch; document.@id; @UI; @mesh-name; @@ -683,12 +687,12 @@ </doc-mesh> <taxa class="TabularExport"> - <outDir>corpora/&corpus;/batches/&batch;</outDir> + <outDir>.</outDir> <files>$</files> <fileName>"taxa.txt"</fileName> <lines>documents.sections.layer:taxa</lines> <columns separator=";"> - "&batch;"; + section.document.@batch; section.document.@id; section.@name; start ^ "-" ^ end; @@ -702,12 +706,12 @@ </taxa> <microorganisms class="TabularExport"> - <outDir>corpora/&corpus;/batches/&batch;</outDir> + <outDir>.</outDir> <files>$</files> <fileName>"microorganisms.txt"</fileName> <lines>documents.sections.layer:microorganism</lines> <columns separator=";"> - "&batch;"; + section.document.@batch; section.document.@id; section.@name; start ^ "-" ^ end; @@ -721,12 +725,12 @@ </microorganisms> <microorganisms-short class="TabularExport"> - <outDir>corpora/&corpus;/batches/&batch;</outDir> + <outDir>.</outDir> <files>$</files> <fileName>"microorganisms-short.txt"</fileName> <lines>documents.sections.layer:microorganism[outside:words and not @form == outside:words.@form]</lines> <columns separator=";"> - "&batch;"; + section.document.@batch; section.document.@id; section.@name; start ^ "-" ^ end; @@ -740,12 +744,12 @@ </microorganisms-short> <bacteria class="TabularExport"> - <outDir>corpora/&corpus;/batches/&batch;</outDir> + <outDir>.</outDir> <files>$</files> <fileName>"bacteria.txt"</fileName> 
<lines>documents.sections.layer:bacteria</lines> <columns separator=";"> - "&batch;"; + section.document.@batch; section.document.@id; section.@name; start ^ "-" ^ end; @@ -759,12 +763,12 @@ </bacteria> <habitats class="TabularExport"> - <outDir>corpora/&corpus;/batches/&batch;</outDir> + <outDir>.</outDir> <files>$</files> <fileName>"habitats.txt"</fileName> <lines>documents.sections.layer:habitats</lines> <columns separator=";"> - "&batch;"; + section.document.@batch; section.document.@id; section.@name; start ^ "-" ^ end; @@ -780,12 +784,12 @@ </habitats> <phenotypes class="TabularExport"> - <outDir>corpora/&corpus;/batches/&batch;</outDir> + <outDir>.</outDir> <files>$</files> <fileName>"phenotypes.txt"</fileName> <lines>documents.sections.layer:phenotypes</lines> <columns separator=";"> - "&batch;"; + section.document.@batch; section.document.@id; section.@name; start ^ "-" ^ end; @@ -801,12 +805,12 @@ </phenotypes> <uses class="TabularExport"> - <outDir>corpora/&corpus;/batches/&batch;</outDir> + <outDir>.</outDir> <files>$</files> <fileName>"uses.txt"</fileName> <lines>documents.sections.layer:uses</lines> <columns separator=";"> - "&batch;"; + section.document.@batch; section.document.@id; section.@name; start ^ "-" ^ end; @@ -822,12 +826,12 @@ </uses> <geo class="TabularExport"> - <outDir>corpora/&corpus;/batches/&batch;</outDir> + <outDir>.</outDir> <files>$</files> <fileName>"geo.txt"</fileName> <lines>documents.sections.layer:Geographical</lines> <columns separator=";"> - "&batch;"; + section.document.@batch; section.document.@id; section.@name; start ^ "-" ^ end; @@ -837,12 +841,12 @@ </geo> <relations class="TabularExport"> - <outDir>corpora/&corpus;/batches/&batch;</outDir> + <outDir>.</outDir> <files>$</files> <fileName>"relations.txt"</fileName> <lines>documents.sections.relations:CooccurrenceLocalization.tuples</lines> <columns separator=";"> - "&batch;"; + section.document.@batch; section.document.@id; section.@name; args:Bacterium.@taxid; @@ -860,12 +864,12 @@ </relations> <relations-pheno 
class="TabularExport"> - <outDir>corpora/&corpus;/batches/&batch;</outDir> + <outDir>.</outDir> <files>$</files> <fileName>"phenotype-relations.txt"</fileName> <lines>documents.sections.relations:PhenotypeRelation.tuples</lines> <columns separator=";"> - "&batch;"; + section.document.@batch; section.document.@id; section.@name; args:Microorganism.@taxid; @@ -883,12 +887,12 @@ </relations-pheno> <relations-use class="TabularExport"> - <outDir>corpora/&corpus;/batches/&batch;</outDir> + <outDir>.</outDir> <files>$</files> <fileName>"uses-relations.txt"</fileName> <lines>documents.sections.relations:UseRelation.tuples</lines> <columns separator=";"> - "&batch;"; + section.document.@batch; section.document.@id; section.@name; args:Microorganism.@taxid; @@ -974,7 +978,7 @@ </index-sentences> <sentences class="TabularExport"> - <outDir>corpora/&corpus;/batches/&batch;</outDir> + <outDir>.</outDir> <files>$</files> <fileName>"sentences.txt"</fileName> <lines>documents.sections.layer:sentences[@name != "author"]</lines> @@ -995,7 +999,7 @@ </sentences> <anaphora class="TabularExport"> - <outDir>corpora/&corpus;/batches/&batch;</outDir> + <outDir>.</outDir> <files>$</files> <fileName>"anaphora.txt"</fileName> <lines>documents.sections.relations:coreferences.tuples[args:Ante]</lines> @@ -1028,7 +1032,7 @@ </anaphora> <dependencies class="TabularExport"> - <outDir>corpora/&corpus;/batches/&batch;</outDir> + <outDir>.</outDir> <files>$</files> <fileName>"dependencies.txt"</fileName> <lines>documents.sections[@name != "author"].relations:dependencies.tuples</lines> @@ -1247,11 +1251,11 @@ </habitat-ancestors> <index class="AlvisDBIndexer"> - <indexDir>corpora/&corpus;/batches/&batch;/adb</indexDir> + <indexDir>adb</indexDir> <elements> <relations> <items>documents.sections.relations:CooccurrenceLocalization.tuples[args:Bacterium[@bacteria == "true"]]</items> - <id>"&batch;_" ^ id:unique</id> + <id>section.document.@batch ^ "_" ^ id:unique</id> <name>"Localization"</name> <type>"localization"</type> 
<args>args:Bacterium|args:Localization</args> @@ -1267,7 +1271,7 @@ </adb> <index class="AlvisIRIndexer"> - <indexDir>corpora/&corpus;/batches/&batch;/index</indexDir> + <indexDir>index</indexDir> <tokenPositionGap>9216</tokenPositionGap> <fieldNames>title,abstract,author,full-author,pmid,year,journal,mesh,url</fieldNames> <relations> @@ -1404,8 +1408,8 @@ </documents> </index> - <index-food class="AlvisIRIndexer"> - <indexDir>corpora/&corpus;/batches/&batch;/index-food</indexDir> +<!-- <index-food class="AlvisIRIndexer"> + <indexDir>.</indexDir> <tokenPositionGap>9216</tokenPositionGap> <fieldNames>title,abstract,author,full-author,pmid,year,journal,mesh,url</fieldNames> <relations> @@ -1538,7 +1542,7 @@ <keyword>document.@url</keyword> </fields> </documents> - </index-food> + </index-food>--> <!-- HTML visualization --> <add-feature class="Action"> @@ -1563,14 +1567,14 @@ </add-feature3> <html class="QuickHTML"> <active>false</active> - <outDir>corpora/&corpus;/batches/&batch;/html</outDir> + <outDir>./html</outDir> <classFeature>ne-type</classFeature> <layers>phenotypes,microorganism,habitats</layers> <colors>#99cc00,#ffcc99,#ffd333,#ffd666</colors> </html> <words class="TabularExport"> - <outDir>corpora/&corpus;/batches/&batch;</outDir> + <outDir>.</outDir> <files>$</files> <fileName>"words.txt"</fileName> <lines>documents.sections[@name == "title" or @name == "abstract"].layer:words</lines> @@ -1582,7 +1586,7 @@ <bionlp-st-a2> <habitats class="TabularExport"> - <outDir>corpora/&corpus;/batches/&batch;/a2</outDir> + <outDir>a2</outDir> <files>documents.sections</files> <fileName>document.@id ^ ".a2"</fileName> <lines>layer:habitats</lines> @@ -1595,7 +1599,7 @@ <phenotypes class="TabularExport"> <append/> - <outDir>corpora/&corpus;/batches/&batch;/a2</outDir> + <outDir>a2</outDir> <files>documents.sections</files> <fileName>document.@id ^ ".a2"</fileName> <lines>layer:phenotypes</lines> @@ -1608,7 +1612,7 @@ <microorganisms class="TabularExport"> <append/> - 
<outDir>corpora/&corpus;/batches/&batch;/a2</outDir> + <outDir>a2</outDir> <files>documents.sections</files> <fileName>document.@id ^ ".a2"</fileName> <lines>layer:microorganism</lines> @@ -1621,7 +1625,7 @@ <obt class="TabularExport"> <append/> - <outDir>corpora/&corpus;/batches/&batch;/a2</outDir> + <outDir>a2</outDir> <files>documents.sections</files> <fileName>document.@id ^ ".a2"</fileName> <lines>layer:habitats|layer:phenotypes</lines> @@ -1633,7 +1637,7 @@ <taxid class="TabularExport"> <append/> - <outDir>corpora/&corpus;/batches/&batch;/a2</outDir> + <outDir>a2</outDir> <files>documents.sections</files> <fileName>document.@id ^ ".a2"</fileName> <lines>layer:microorganism</lines> @@ -1645,7 +1649,7 @@ <lives-in class="TabularExport"> <append/> - <outDir>corpora/&corpus;/batches/&batch;/a2</outDir> + <outDir>a2</outDir> <files>documents.sections</files> <fileName>document.@id ^ ".a2"</fileName> <lines>relations:CooccurrenceLocalization.tuples[args:Localization.@concept-id != ""]</lines> @@ -1657,7 +1661,7 @@ <exhibits class="TabularExport"> <append/> - <outDir>corpora/&corpus;/batches/&batch;/a2</outDir> + <outDir>a2</outDir> <files>documents.sections</files> <fileName>document.@id ^ ".a2"</fileName> <lines>relations:PhenotypeRelation.tuples</lines> @@ -1670,7 +1674,7 @@ <success class="TabularExport"> - <outDir>corpora/&corpus;/batches/&batch;</outDir> + <outDir>.</outDir> <files>$</files> <fileName>"success.txt"</fileName> <lines>documents</lines> diff --git a/plans/map_habitats.plan b/plans/map_habitats.plan index 07777780e184fa6ede0bf8d55785a7e3a463d7f9..5aff349486ac1d6145c03930300193eeb36b05e4 100644 --- a/plans/map_habitats.plan +++ b/plans/map_habitats.plan @@ -168,7 +168,7 @@ <!-- Run Yatea term extractor --> <yatea class="YateaExtractor"> - <xmlTermsFile>ancillaries/yatea/candidates.xml</xmlTermsFile> + <xmlTermsFile>yatea/candidates.xml</xmlTermsFile> <posFeature>tt_pos</posFeature> <configDir>ancillaries/YaTeA/config-habitats</configDir> 
<localeDir>ancillaries/YaTeA/locale</localeDir> @@ -178,7 +178,7 @@ <!-- Run Yatea term extractor on variants --> <yatea-var class="YateaExtractor"> - <xmlTermsFile>ancillaries/yatea-var/candidates.xml</xmlTermsFile> + <xmlTermsFile>yatea-var/candidates.xml</xmlTermsFile> <posFeature>tt_pos</posFeature> <lemmaFeature>variant</lemmaFeature> <configDir>ancillaries/YaTeA/config-habitats</configDir> @@ -258,7 +258,7 @@ <setFeatures/> </add-score> - <tomap-habitats file="plans/tomap-habitats-generic.plan"/> + <tomap-habitats file="plans/tomap-habitats.plan"/> <!-- <remove-living-org-overlapping-geo class="Action"> --> <!-- <target>documents.sections.layer:habitats[@concept-path ?= "OBT:000002" and span:Geographical]</target> --> diff --git a/plans/tomap-habitats.plan b/plans/tomap-habitats.plan index 46bbb98d0e1a2aadb023988465071097e7a11a72..da458fd1932cceace194e9cae09134bd2bf10027 100644 --- a/plans/tomap-habitats.plan +++ b/plans/tomap-habitats.plan @@ -3,18 +3,18 @@ <!-- ToMap on lemmas --> <tomap class="TomapProjector"> - <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> + <yateaFile output-feed="true">yatea/candidates.xml</yateaFile> <targetLayerName>habitats</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> - <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" graylist="ancillaries/graylist_extended.heads" whole-proxy-distance="false">&ontobiotope;-Habitat.tomap</tomapClassifier> + <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" graylist="ancillaries/graylist_extended.heads" whole-proxy-distance="false">ancillaries/BioNLP-OST+EnovFood-Habitat.tomap</tomapClassifier> <lemmaKeys/> <subject feature="lemma" layer="words"/> <scoreFeature>score</scoreFeature> </tomap> <concept-names class="OBOMapper"> - <oboFiles>&ontobiotope;-Habitat.obo</oboFiles> + <oboFiles>ancillaries/BioNLP-OST+EnovFood-Habitat.obo</oboFiles> <idKeys/> 
<target>documents.sections.layer:habitats</target> <form>@concept-id</form> @@ -26,18 +26,18 @@ <tomap-on-alternative-lemmas> <tomap class="TomapProjector"> - <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> + <yateaFile output-feed="true">yatea/candidates.xml</yateaFile> <targetLayerName>habitats2</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> - <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" graylist="ancillaries/graylist_extended.heads" whole-proxy-distance="false">&ontobiotope;-Habitat.tomap</tomapClassifier> + <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" graylist="ancillaries/graylist_extended.heads" whole-proxy-distance="false">ancillaries/BioNLP-OST+EnovFood-Habitat.tomap</tomapClassifier> <lemmaKeys/> <subject feature="lemma2" layer="words"/> <scoreFeature>score</scoreFeature> </tomap> <concept-names class="OBOMapper"> - <oboFiles>&ontobiotope;-Habitat.obo</oboFiles> + <oboFiles>ancillaries/BioNLP-OST+EnovFood-Habitat.obo</oboFiles> <idKeys/> <target>documents.sections.layer:habitats2</target> <form>@concept-id</form> @@ -58,17 +58,17 @@ <tomap-no-lemmakeys> <tomap class="TomapProjector"> - <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> + <yateaFile output-feed="true">yatea/candidates.xml</yateaFile> <targetLayerName>habitats3</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> - <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" graylist="ancillaries/graylist_extended.heads" whole-proxy-distance="false">&ontobiotope;-Habitat.tomap</tomapClassifier> + <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" graylist="ancillaries/graylist_extended.heads" whole-proxy-distance="false">ancillaries/BioNLP-OST+EnovFood-Habitat.tomap</tomapClassifier> <subject feature="lemma" 
layer="words"/> <scoreFeature>score</scoreFeature> </tomap> <concept-names class="OBOMapper"> - <oboFiles>&ontobiotope;-Habitat.obo</oboFiles> + <oboFiles>ancillaries/BioNLP-OST+EnovFood-Habitat.obo</oboFiles> <idKeys/> <target>documents.sections.layer:habitats3</target> <form>@concept-id</form> @@ -89,18 +89,18 @@ <tomap-on-variants> <tomap class="TomapProjector"> - <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea-var/candidates.xml</yateaFile> + <yateaFile output-feed="true">yatea-var/candidates.xml</yateaFile> <targetLayerName>habitats4</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> - <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" graylist="ancillaries/graylist_extended.heads" whole-proxy-distance="false">&ontobiotope;-Habitat.tomap</tomapClassifier> + <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" graylist="ancillaries/graylist_extended.heads" whole-proxy-distance="false">ancillaries/BioNLP-OST+EnovFood-Habitat.tomap</tomapClassifier> <lemmaKeys/> <subject feature="variant" layer="words"/> <scoreFeature>score</scoreFeature> </tomap> <concept-names class="OBOMapper"> - <oboFiles>&ontobiotope;-Habitat.obo</oboFiles> + <oboFiles>ancillaries/BioNLP-OST+EnovFood-Habitat.obo</oboFiles> <idKeys/> <target>documents.sections.layer:habitats4</target> <form>@concept-id</form> @@ -121,17 +121,17 @@ <tomap-no-lemmakeys-word-form> <tomap class="TomapProjector"> - <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> + <yateaFile output-feed="true">yatea/candidates.xml</yateaFile> <targetLayerName>habitats5</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> - <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" graylist="ancillaries/graylist_extended.heads" whole-proxy-distance="false">&ontobiotope;-Habitat.tomap</tomapClassifier> + 
<tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" graylist="ancillaries/graylist_extended.heads" whole-proxy-distance="false">ancillaries/BioNLP-OST+EnovFood-Habitat.tomap</tomapClassifier> <subject feature="form" layer="words"/> <scoreFeature>score</scoreFeature> </tomap> <concept-names class="OBOMapper"> - <oboFiles>&ontobiotope;-Habitat.obo</oboFiles> + <oboFiles>ancillaries/BioNLP-OST+EnovFood-Habitat.obo</oboFiles> <idKeys/> <target>documents.sections.layer:habitats5</target> <form>@concept-id</form> @@ -196,7 +196,7 @@ <bioyatea-projection class="YateaTermsProjector"> <targetLayerName>yateaTerms</targetLayerName> <!--<yateaFile inhibitCheck="true">words_prepro/default/xml/candidates_pp.xml</yateaFile>--> - <yateaFile output-feed="yes">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> <!-- ??? --> + <yateaFile output-feed="yes">yatea/candidates.xml</yateaFile> <!-- ??? --> <subject layer="words"/> <termLemma>lemma</termLemma> </bioyatea-projection> @@ -388,20 +388,20 @@ <removeFromLayer/> </delete-not-selected> -<rule-for-food-corpus> - <tag-overlap-food class="Action"> + <!-- <rule-for-food-corpus> + <tag-overlap-food class="Action"> --> <!-- <target>documents.sections.layer:habitats[not @concept-path =~ "OBT:000008/"]</target> --> - <target>documents[@id in "corpora/&corpus;/food-pmids.txt"].sections.layer:habitats[not @concept-path =~ "OBT:000008/"]</target> + <!-- <target>documents[@id in "corpora/&corpus;/food-pmids.txt"].sections.layer:habitats[not @concept-path =~ "OBT:000008/"]</target> <action>set:feat:overlap-food(span:habitats[@score == target.@score and @concept-path =~ "OBT:000008/"])</action> <setFeatures/> </tag-overlap-food> <remove-overlap-food class="Action"> - <target>documents[@id in "corpora/&corpus;/food-pmids.txt"].sections.layer:habitats[not @concept-path =~ "OBT:000008/" and not @overlap-food == ""]</target> + <target>documents[@id in "corpora/&corpus;/food-pmids.txt"].sections.layer:habitats[not @concept-path 
=~ "OBT:000008/" and not @overlap-food == ""]</target>--> <!-- <target>documents.sections.layer:habitats[not @concept-path =~ "OBT:000008/" and not @overlap-food == ""]</target> --> - <action>remove:habitats</action> + <!-- <action>remove:habitats</action> <removeFromLayer/> </remove-overlap-food> -</rule-for-food-corpus> +</rule-for-food-corpus>--> <!-- Keep only the highest scored concepts --> <keep-highest class="Action"> @@ -714,7 +714,7 @@ <!-- Add concept-path in case some are missing --> <concept-path class="OBOMapper"> - <oboFiles>&ontobiotope;-Habitat.obo</oboFiles> + <oboFiles>ancillaries/BioNLP-OST+EnovFood-Habitat.obo</oboFiles> <idKeys/> <target>documents.sections.layer:habitats</target> <form>@concept-id</form> diff --git a/plans/tomap-microbial-phenotypes.plan b/plans/tomap-microbial-phenotypes.plan index 17481643bb3b90c66ab16caa337ee600eb2f1f0f..f55357aa56a7004280b8a5bae6cef17a501a97fe 100644 --- a/plans/tomap-microbial-phenotypes.plan +++ b/plans/tomap-microbial-phenotypes.plan @@ -2,18 +2,18 @@ <!-- ToMap on lemmas --> <tomap class="TomapProjector"> - <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> + <yateaFile output-feed="true">yatea/candidates.xml</yateaFile> <targetLayerName>phenotypes</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> - <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" whole-proxy-distance="false">&ontobiotope;-Phenotype.tomap</tomapClassifier> + <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" whole-proxy-distance="false">ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap</tomapClassifier> <lemmaKeys/> <subject feature="lemma" layer="words"/> <scoreFeature>score</scoreFeature> </tomap> <concept-names class="OBOMapper"> - <oboFiles>&ontobiotope;-Phenotype.obo</oboFiles> + <oboFiles>ancillaries/BioNLP-OST+EnovFood-Phenotype.obo</oboFiles> <idKeys/> 
<target>documents.sections.layer:phenotypes</target> <form>@concept-id</form> @@ -25,18 +25,18 @@ <tomap-on-alternative-lemmas> <tomap class="TomapProjector"> - <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> + <yateaFile output-feed="true">yatea/candidates.xml</yateaFile> <targetLayerName>phenotypes2</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> - <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" whole-proxy-distance="false">&ontobiotope;-Phenotype.tomap</tomapClassifier> + <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" whole-proxy-distance="false">ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap</tomapClassifier> <lemmaKeys/> <subject feature="lemma2" layer="words"/> <scoreFeature>score</scoreFeature> </tomap> <concept-names class="OBOMapper"> - <oboFiles>&ontobiotope;-Phenotype.obo</oboFiles> + <oboFiles>ancillaries/BioNLP-OST+EnovFood-Phenotype.obo</oboFiles> <idKeys/> <target>documents.sections.layer:phenotypes2</target> <form>@concept-id</form> @@ -57,17 +57,17 @@ <tomap-no-lemmakeys> <tomap class="TomapProjector"> - <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> + <yateaFile output-feed="true">yatea/candidates.xml</yateaFile> <targetLayerName>phenotypes3</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> - <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" whole-proxy-distance="false">&ontobiotope;-Phenotype.tomap</tomapClassifier> + <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" whole-proxy-distance="false">ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap</tomapClassifier> <subject feature="lemma" layer="words"/> <scoreFeature>score</scoreFeature> </tomap> <concept-names class="OBOMapper"> - <oboFiles>&ontobiotope;-Phenotype.obo</oboFiles> + 
<oboFiles>ancillaries/BioNLP-OST+EnovFood-Phenotype.obo</oboFiles> <idKeys/> <target>documents.sections.layer:phenotypes3</target> <form>@concept-id</form> @@ -89,18 +89,18 @@ <tomap-on-variants> <tomap class="TomapProjector"> - <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea-var/candidates.xml</yateaFile> + <yateaFile output-feed="true">yatea-var/candidates.xml</yateaFile> <targetLayerName>phenotypes4</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> - <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" whole-proxy-distance="false">&ontobiotope;-Phenotype.tomap</tomapClassifier> + <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" whole-proxy-distance="false">ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap</tomapClassifier> <lemmaKeys/> <subject feature="variant" layer="words"/> <scoreFeature>score</scoreFeature> </tomap> <concept-names class="OBOMapper"> - <oboFiles>&ontobiotope;-Phenotype.obo</oboFiles> + <oboFiles>ancillaries/BioNLP-OST+EnovFood-Phenotype.obo</oboFiles> <idKeys/> <target>documents.sections.layer:phenotypes4</target> <form>@concept-id</form> @@ -121,17 +121,17 @@ <tomap-no-lemmakeys-word-form> <tomap class="TomapProjector"> - <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> + <yateaFile output-feed="true">yatea/candidates.xml</yateaFile> <targetLayerName>phenotypes5</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> - <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" whole-proxy-distance="false">&ontobiotope;-Phenotype.tomap</tomapClassifier> + <tomapClassifier empty-words="ancillaries/stopwords_EN.ttg" whole-proxy-distance="false">ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap</tomapClassifier> <subject feature="form" layer="words"/> <scoreFeature>score</scoreFeature> </tomap> <concept-names 
class="OBOMapper"> - <oboFiles>&ontobiotope;-Phenotype.obo</oboFiles> + <oboFiles>ancillaries/BioNLP-OST+EnovFood-Phenotype.obo</oboFiles> <idKeys/> <target>documents.sections.layer:phenotypes5</target> <form>@concept-id</form> @@ -166,7 +166,7 @@ <bioyatea-projection class="YateaTermsProjector"> <targetLayerName>yateaTerms</targetLayerName> <!--<yateaFile inhibitCheck="true">words_prepro/default/xml/candidates_pp.xml</yateaFile>--> - <yateaFile output-feed="yes">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> <!-- ??? --> + <yateaFile output-feed="yes">yatea/candidates.xml</yateaFile> <!-- ??? --> <subject layer="words"/> <termLemma>lemma</termLemma> </bioyatea-projection> @@ -394,7 +394,7 @@ <!-- <idFeature>molecule-id</idFeature> --> <!-- <nameFeature>molecule-name</nameFeature> --> <!-- <pathFeature>molecule-path</pathFeature> --> - <!-- <oboFiles>&ontobiotope;-Molecule.obo</oboFiles> --> + <!-- <oboFiles>ancillaries/BioNLP-OST+EnovFood-Molecule.obo</oboFiles> --> <!-- <subject feature="lemma" layer="words"/> --> <!-- <targetLayerName>molecules</targetLayerName> --> <!-- </molecule-projection-on-lemmas> --> @@ -404,7 +404,7 @@ <!-- <idFeature>molecule-id</idFeature> --> <!-- <nameFeature>molecule-name</nameFeature> --> <!-- <pathFeature>molecule-path</pathFeature> --> - <!-- <oboFiles>&ontobiotope;-Molecule.obo</oboFiles> --> + <!-- <oboFiles>ancillaries/BioNLP-OST+EnovFood-Molecule.obo</oboFiles> --> <!-- <subject feature="form" layer="words"/> --> <!-- <targetLayerName>molecules2</targetLayerName> --> <!-- </molecule-projection-on-words> --> @@ -469,7 +469,7 @@ <!-- Add concept-path in case some are missing --> <concept-path class="OBOMapper"> - <oboFiles>&ontobiotope;-Phenotype.obo</oboFiles> + <oboFiles>ancillaries/BioNLP-OST+EnovFood-Phenotype.obo</oboFiles> <idKeys/> <target>documents.sections.layer:phenotypes</target> <form>@concept-id</form> diff --git a/plans/use-extraction.plan b/plans/use-extraction.plan index 
5d12302993b0490526cb682ad0934cca5820bda1..87195d8d5434aaa4ac887c01765ea83e3c1d4a05 100644 --- a/plans/use-extraction.plan +++ b/plans/use-extraction.plan @@ -2,7 +2,7 @@ <alvisnlp-plan id="Use-extraction"> <exact-match class="OBOProjector"> - <oboFiles>&ontobiotope-use;.obo</oboFiles> + <oboFiles>ancillaries/Use_V2.obo</oboFiles> <targetLayerName>uses</targetLayerName> <subject feature="form" layer="words"/> <idFeature>concept-id</idFeature> @@ -12,7 +12,7 @@ </exact-match> <exact-match-2 class="OBOProjector"> - <oboFiles>&ontobiotope-use;.obo</oboFiles> + <oboFiles>ancillaries/Use_V2.obo</oboFiles> <targetLayerName>uses2</targetLayerName> <subject feature="lemma" layer="words"/> <idFeature>concept-id</idFeature> diff --git a/process-evaluate_BioNLP-OST.snakefile b/process-evaluate_BioNLP-OST.snakefile index c6b43731d8c0626bc0a5d556b984ed992e41dab2..e50a8be95703306378fc1a5da5719bd885c49691 100644 --- a/process-evaluate_BioNLP-OST.snakefile +++ b/process-evaluate_BioNLP-OST.snakefile @@ -39,9 +39,14 @@ rule run_bionlp_prediction: params: batch="{B}", corpus='BioNLP-OST-2019', - inhibitSyntax='inhibit-syntax', - onto='ancillaries/BioNLP-OST+EnovFood', - ontobiotopeUse='ancillaries/Use_V2', + inhibitSyntax='inhibit-syntax', + onto_habitat='ancillaries/BioNLP-OST+EnovFood-Habitat.obo', + tomap_habitat='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', + onto_pheno='ancillaries/BioNLP-OST+EnovFood-Phenotype.obo', + tomap_pheno='ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap', + graylist='ancillaries/graylist_extended.heads', + emptywords='ancillaries/stopwords_EN.ttg', + ontobiotopeUse='ancillaries/Use_V2.obo', plan='plans/entities.plan', dir='corpora/BioNLP-OST-2019/batches/{B}/', taxid_microorganisms='ancillaries/extended-microorganisms-taxonomy/taxid_microorganisms.txt', @@ -52,14 +57,14 @@ rule run_bionlp_prediction: -log {log} \ -alias format bionlp-st \ -alias input-dir {input.dir} \ - -alias input-xslt {input.xslt} \ - -alias outputDir {params.dir} \ + -outputDir 
{params.dir} \ -environmentEntities \ - -entity corpus {params.corpus} \ -feat inhibit-syntax {params.inhibitSyntax} \ - -entity ontobiotope {params.onto} \ - -entity ontobiotope-use {params.ontobiotopeUse} \ - -entity batch {params.batch} \ + -alias ontobiotope-habitat {params.onto_habitat} \ + -xalias '<ontobiotope-tomap-habitat empty-words="{params.emptywords}" graylist="{params.graylist}" whole-proxy-distance="false">{params.tomap_habitat}</ontobiotope-tomap-habitat>' \ + -alias ontobiotope-phenotypes {params.onto_pheno} \ + -xalias '<ontobiotope-tomap-phenotypes empty-words="{params.emptywords}" whole-proxy-distance="false">{params.tomap_pheno}</ontobiotope-tomap-phenotypes>' \ + -alias ontobiotope-use {params.ontobiotopeUse} \ -alias taxid_microorganisms {params.taxid_microorganisms} \ -alias taxa+id_full {params.taxa_id_full} \ {params.plan} diff --git a/process_CIRM_corpus.snakefile b/process_CIRM_corpus.snakefile index 2b172cf78ac91f2f0387201244ff924c6a48a7ee..f9f49e74a7c963826b96a1eed27b4ebbecd48f8c 100644 --- a/process_CIRM_corpus.snakefile +++ b/process_CIRM_corpus.snakefile @@ -24,8 +24,8 @@ rule get_cirm_bia_taxa_habitats: strain_index='1', habitat_index='15' output: - taxa='corpora/cirm/bia_taxa.txt', - habitats='corpora/cirm/bia_habitats.txt', + taxa='corpora/cirm/bia/bia_taxa.txt', + habitats='corpora/cirm/bia/bia_habitats.txt', tsv='corpora/cirm/BIA_2021/florilege_export_final_17_02_21.tsv' conda: 'softwares/envs/python3_pandas_env.yaml' shell: """ @@ -42,8 +42,8 @@ rule get_cirm_yeast_taxa_habitats: taxa_index='1', habitat_index='10,11' output: - taxa='corpora/cirm/yeast_taxa.txt', - habitats='corpora/cirm/yeast_habitats.txt', + taxa='corpora/cirm/levures/yeast_taxa.txt', + habitats='corpora/cirm/levures/yeast_habitats.txt', tsv='corpora/cirm/Levures_2021/Florilege_21012021.tsv' conda: 'softwares/envs/python3_pandas_env.yaml' shell: """ @@ -61,8 +61,8 @@ rule get_cirm_cfbp_taxa_habitats: strain_index='1', habitat_index='6,10,13,14' output: - 
taxa='corpora/cirm/cfbp_taxa.txt', - habitats='corpora/cirm/cfbp_habitats.txt', + taxa='corpora/cirm/cfbp/cfbp_taxa.txt', + habitats='corpora/cirm/cfbp/cfbp_habitats.txt', tsv='corpora/cirm/CFBP_2021/20210617_PPortier.tsv' conda: 'softwares/envs/python3_pandas_env.yaml' shell: """ @@ -75,9 +75,9 @@ map microorganisms ''' rule map_cirm_bia_microorganisms: input: - taxa='corpora/cirm/bia_taxa.txt' + taxa='corpora/cirm/bia/bia_taxa.txt' output: - mapped_taxaids='corpora/cirm/mapped_bia_taxa.txt' + mapped_taxaids='corpora/cirm/bia/mapped_bia_taxa.txt' params: plan='plans/map_microorganisms.plan', taxid_microorganisms='ancillaries/extended-microorganisms-taxonomy/taxid_microorganisms.txt', @@ -96,9 +96,9 @@ map microorganisms (CIRM Levures) ''' rule map_cirm_yeast_microorganisms: input: - taxa='corpora/cirm/yeast_taxa.txt' + taxa='corpora/cirm/levures/yeast_taxa.txt' output: - mapped_taxaids='corpora/cirm/mapped_yeast_taxa.txt' + mapped_taxaids='corpora/cirm/levures/mapped_yeast_taxa.txt' params: plan='plans/map_microorganisms.plan', taxid_microorganisms='ancillaries/extended-microorganisms-taxonomy/taxid_microorganisms.txt', @@ -118,9 +118,9 @@ map microorganisms (CIRM CFBP) ''' rule map_cirm_cfbp_microorganisms: input: - taxa='corpora/cirm/cfbp_taxa.txt' + taxa='corpora/cirm/cfbp/cfbp_taxa.txt' output: - mapped_taxa='corpora/cirm/mapped_cfbp_taxa.txt' + mapped_taxa='corpora/cirm/cfbp/mapped_cfbp_taxa.txt' params: plan='plans/map_microorganisms.plan', taxid_microorganisms='ancillaries/extended-microorganisms-taxonomy/taxid_microorganisms.txt', @@ -139,19 +139,22 @@ map habitats of microorganisms ''' rule map_cirm_habitats: input: - habitats='corpora/cirm/bia_habitats.txt' + habitats='corpora/cirm/bia/bia_habitats.txt' output: - mapped_habitats='corpora/cirm/mapped_bia_habitats.txt' + mapped_habitats='corpora/cirm/bia/mapped_bia_habitats.txt' params: plan='plans/map_habitats.plan', onto='ancillaries/BioNLP-OST+EnovFood-Habitat.obo', 
tomap='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', graylist='ancillaries/graylist_extended.heads', - emptywords='ancillaries/stopwords_EN.ttg' + emptywords='ancillaries/stopwords_EN.ttg', + outdir='corpora/cirm/bia', + outfile='mapped_bia_habitats.txt' singularity:config["SINGULARITY_IMG"] shell: """alvisnlp -J-Xmx32g -cleanTmp -verbose \ -alias input {input.habitats} \ - -alias output {output.mapped_habitats} \ + -outputDir {params.outdir} \ + -alias output {params.outfile} \ -alias ontobiotope {params.onto} \ -xalias '<ontobiotope-tomap empty-words="{params.emptywords}" graylist="{params.graylist}" whole-proxy-distance="false">{params.tomap}</ontobiotope-tomap>' \ {params.plan} @@ -162,19 +165,22 @@ map habitats of microorganisms (CIRM Levures) ''' rule map_cirm_yeast_habitats: input: - habitats='corpora/cirm/yeast_habitats.txt' + habitats='corpora/cirm/levures/yeast_habitats.txt' output: - mapped_habitats='corpora/cirm/mapped_yeast_habitats.txt' + mapped_habitats='corpora/cirm/levures/mapped_yeast_habitats.txt' params: plan='plans/map_habitats.plan', onto='ancillaries/BioNLP-OST+EnovFood-Habitat.obo', tomap='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', graylist='ancillaries/graylist_extended.heads', - emptywords='ancillaries/stopwords_EN.ttg' + emptywords='ancillaries/stopwords_EN.ttg', + outdir='corpora/cirm/levures', + outfile='mapped_yeast_habitats.txt' singularity:config["SINGULARITY_IMG"] shell: """alvisnlp -J-Xmx32g -cleanTmp -verbose \ -alias input {input.habitats} \ - -alias output {output.mapped_habitats} \ + -outputDir {params.outdir} \ + -alias output {params.outfile} \ -alias ontobiotope {params.onto} \ -xalias '<ontobiotope-tomap empty-words="{params.emptywords}" graylist="{params.graylist}" whole-proxy-distance="false">{params.tomap}</ontobiotope-tomap>' \ {params.plan} @@ -185,19 +191,22 @@ map habitats of microorganisms (CIRM CFBP) ''' rule map_cirm_cfbp_habitats: input: - habitats='corpora/cirm/cfbp_habitats.txt' + 
habitats='corpora/cirm/cfbp/cfbp_habitats.txt' output: - mapped_habitats='corpora/cirm/mapped_cfbp_habitats.txt' + mapped_habitats='corpora/cirm/cfbp/mapped_cfbp_habitats.txt' params: plan='plans/map_habitats.plan', onto='ancillaries/BioNLP-OST+EnovFood-Habitat.obo', tomap='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', graylist='ancillaries/graylist_extended.heads', - emptywords='ancillaries/stopwords_EN.ttg' + emptywords='ancillaries/stopwords_EN.ttg', + outdir='corpora/cirm/cfbp', + outfile='mapped_cfbp_habitats.txt' singularity:config["SINGULARITY_IMG"] shell: """alvisnlp -J-Xmx32g -cleanTmp -verbose \ -alias input {input.habitats} \ - -alias output {output.mapped_habitats} \ + -outputDir {params.outdir} \ + -alias output {params.outfile} \ -alias ontobiotope {params.onto} \ -xalias '<ontobiotope-tomap empty-words="{params.emptywords}" graylist="{params.graylist}" whole-proxy-distance="false">{params.tomap}</ontobiotope-tomap>' \ {params.plan} @@ -209,8 +218,8 @@ format results rule format_cirm_results: input: file='corpora/cirm/BIA_2021/florilege_export_final_17_02_21.tsv', - taxa='corpora/cirm/mapped_bia_taxa.txt', - habitats='corpora/cirm/mapped_bia_habitats.txt' + taxa='corpora/cirm/bia/mapped_bia_taxa.txt', + habitats='corpora/cirm/bia/mapped_bia_habitats.txt' output: result='corpora/florilege/cirm/cirm-bia-results.txt' params: @@ -226,8 +235,8 @@ format results (CIRM Levures) rule format_cirm_yeast_results: input: file='corpora/cirm/Levures_2021/Florilege_21012021.tsv', - taxa='corpora/cirm/mapped_yeast_taxa.txt', - habitats='corpora/cirm/mapped_yeast_habitats.txt' + taxa='corpora/cirm/levures/mapped_yeast_taxa.txt', + habitats='corpora/cirm/levures/mapped_yeast_habitats.txt' output: result='corpora/florilege/cirm/cirm-yeast-results.txt' params: @@ -242,8 +251,8 @@ format results (CIRM CFBP) rule format_cirm_cfbp_results: input: file='corpora/cirm/CFBP_2021/20210617_PPortier.tsv', - taxa='corpora/cirm/mapped_cfbp_taxa.txt', - 
habitats='corpora/cirm/mapped_cfbp_habitats.txt' + taxa='corpora/cirm/cfbp/mapped_cfbp_taxa.txt', + habitats='corpora/cirm/cfbp/mapped_cfbp_habitats.txt' output: result='corpora/florilege/cirm/cirm-cfbp-results.txt' params: diff --git a/process_DSMZ_corpus.snakefile b/process_DSMZ_corpus.snakefile index b4fa67fa3a0830be87fa78e37a69270333359c55..8fde651932d3c4255ef92b8bbbfb11f822308cf3 100644 --- a/process_DSMZ_corpus.snakefile +++ b/process_DSMZ_corpus.snakefile @@ -35,11 +35,14 @@ rule map_dsmz_habitats: onto='ancillaries/BioNLP-OST+EnovFood-Habitat.obo', tomap='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', graylist='ancillaries/graylist_extended.heads', - emptywords='ancillaries/stopwords_EN.ttg' + emptywords='ancillaries/stopwords_EN.ttg', + outdir='corpora/dsmz', + outfile='mapped_habitats.txt' singularity:config["SINGULARITY_IMG"] shell: """alvisnlp -J-Xmx32g -cleanTmp -verbose \ -alias input {input.habitats} \ - -alias output {output.mapped_habitats} \ + -outputDir {params.outdir} \ + -alias output {params.outfile} \ -alias ontobiotope {params.onto} \ -xalias '<ontobiotope-tomap empty-words="{params.emptywords}" graylist="{params.graylist}" whole-proxy-distance="false">{params.tomap}</ontobiotope-tomap>' \ {params.plan} diff --git a/process_GenBank_corpus.snakefile b/process_GenBank_corpus.snakefile index 388f9ab1da3af0ffa392a81c44802b85b812d21b..888835e413b1864c216cebcf38a65355270f103e 100644 --- a/process_GenBank_corpus.snakefile +++ b/process_GenBank_corpus.snakefile @@ -79,11 +79,14 @@ rule map_genbank_habitats: tomap='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', graylist='ancillaries/graylist_extended.heads', emptywords='ancillaries/stopwords_EN.ttg', - inhibitSyntax='inhibit-syntax' + inhibitSyntax='inhibit-syntax', + outdir='corpora/genbank', + outfile='mapped_habitats.txt' singularity:config["SINGULARITY_IMG"] shell: """alvisnlp -J-Xmx32g -cleanTmp -verbose \ -alias input {input.habitats} \ - -alias output {output.mapped_habitats}\ + -outputDir 
{params.outdir} \ + -alias output {params.outfile} \ -alias ontobiotope {params.onto} \ -feat inhibit-syntax {params.inhibitSyntax} \ -xalias '<ontobiotope-tomap empty-words="{params.emptywords}" graylist="{params.graylist}" whole-proxy-distance="false">{params.tomap}</ontobiotope-tomap>' \ diff --git a/process_PubMed_corpus.snakefile b/process_PubMed_corpus.snakefile index 7fd42f33b809b64d119f5f88c543976bf8ebc097..5710034a3d3862750f5fcff9027c2a2411ed89db 100644 --- a/process_PubMed_corpus.snakefile +++ b/process_PubMed_corpus.snakefile @@ -46,9 +46,14 @@ rule run_pubmed_entities: params: batch="{B}", corpus='pubmed', - inhibitSyntax='inhibit-syntax', - onto='ancillaries/BioNLP-OST+EnovFood', - ontobiotopeUse='ancillaries/Use_V2', + inhibitSyntax='inhibit-syntax', + onto_habitat='ancillaries/BioNLP-OST+EnovFood-Habitat.obo', + tomap_habitat='ancillaries/BioNLP-OST+EnovFood-Habitat.tomap', + onto_pheno='ancillaries/BioNLP-OST+EnovFood-Phenotype.obo', + tomap_pheno='ancillaries/BioNLP-OST+EnovFood-Phenotype.tomap', + graylist='ancillaries/graylist_extended.heads', + emptywords='ancillaries/stopwords_EN.ttg', + ontobiotopeUse='ancillaries/Use_V2.obo', plan='plans/entities.plan', dir='corpora/pubmed/batches/{B}/', taxid_microorganisms='ancillaries/extended-microorganisms-taxonomy/taxid_microorganisms.txt', @@ -61,13 +66,15 @@ rule run_pubmed_entities: -alias format pubmed \ -alias input {input.file} \ -alias input-xslt {input.xslt} \ - -alias outputDir {params.dir} \ + -alias batch batch={params.batch} \ + -outputDir {params.dir} \ -environmentEntities \ - -entity corpus {params.corpus} \ -feat inhibit-syntax {params.inhibitSyntax} \ - -entity ontobiotope {params.onto} \ - -entity ontobiotope-use {params.ontobiotopeUse} \ - -entity batch {params.batch} \ + -alias ontobiotope-habitat {params.onto_habitat} \ + -xalias '<ontobiotope-tomap-habitat empty-words="{params.emptywords}" graylist="{params.graylist}" 
whole-proxy-distance="false">{params.tomap_habitat}</ontobiotope-tomap-habitat>' \ + -alias ontobiotope-phenotypes {params.onto_pheno} \ + -xalias '<ontobiotope-tomap-phenotypes empty-words="{params.emptywords}" whole-proxy-distance="false">{params.tomap_pheno}</ontobiotope-tomap-phenotypes>' \ + -alias ontobiotope-use {params.ontobiotopeUse} \ -alias taxid_microorganisms {params.taxid_microorganisms} \ -alias taxa+id_full {params.taxa_id_full} \ {params.plan}