From 9152896d320959e99f54114a0860ff3c1e14cb04 Mon Sep 17 00:00:00 2001 From: Mouhamadou Ba <mouhamadou.ba@inra.fr> Date: Tue, 6 Apr 2021 11:58:16 +0200 Subject: [PATCH] harmonize folders --- .../batch/0001/batch.xml | 0 .../batch/0007/batch.xml | 0 .../microbes-2019-pubmed2alvisnlp.xslt | 0 docs/6-process-pubmed-data.md | 8 +-- docs/run.md | 4 +- docs/stats.md | 16 ++--- generate_stats.snakefile | 38 +++++----- plans/entities.plan | 70 +++++++++---------- plans/tomap-habitats.plan | 12 ++-- plans/tomap-microbial-phenotypes.plan | 12 ++-- process-evaluate_BioNLP-OST.snakefile | 38 +++++----- process_PubMed_corpus.snakefile | 55 ++++++++------- 12 files changed, 127 insertions(+), 126 deletions(-) rename corpora/{microbes-2019/batch => pubmed/batches}/0001/batch.xml (100%) rename corpora/{microbes-2019/batch => pubmed/batches}/0007/batch.xml (100%) rename corpora/{microbes-2019 => pubmed}/microbes-2019-pubmed2alvisnlp.xslt (100%) diff --git a/corpora/microbes-2019/batch/0001/batch.xml b/corpora/pubmed/batches/0001/batch.xml similarity index 100% rename from corpora/microbes-2019/batch/0001/batch.xml rename to corpora/pubmed/batches/0001/batch.xml diff --git a/corpora/microbes-2019/batch/0007/batch.xml b/corpora/pubmed/batches/0007/batch.xml similarity index 100% rename from corpora/microbes-2019/batch/0007/batch.xml rename to corpora/pubmed/batches/0007/batch.xml diff --git a/corpora/microbes-2019/microbes-2019-pubmed2alvisnlp.xslt b/corpora/pubmed/microbes-2019-pubmed2alvisnlp.xslt similarity index 100% rename from corpora/microbes-2019/microbes-2019-pubmed2alvisnlp.xslt rename to corpora/pubmed/microbes-2019-pubmed2alvisnlp.xslt diff --git a/docs/6-process-pubmed-data.md b/docs/6-process-pubmed-data.md index 71a0e2b2..7271d1d3 100644 --- a/docs/6-process-pubmed-data.md +++ b/docs/6-process-pubmed-data.md @@ -1,6 +1,6 @@ ## About The pipeline extracts microorganisms, habitats of texts from Pubmed. The workflow uses alvisnlp plan `plans/entities.plan`. 
-Pubmed corpus is split into several batches that are available into `corpora/microbes-2019/batch`. +Pubmed corpus is split into several batches that are available in `corpora/pubmed/batches`. The bacthes are automatically scanned by the pipeline. <img align="right" width="800" src="6-pipeline.svg"> @@ -56,13 +56,13 @@ The pipeline relies on the following alvisnlp plan: The pipeline handles the following resources : * inputs - * `corpora/microbes-2019` + * `corpora/pubmed` * `ancillaries/OntoBiotope_BioNLP-OST-2019-Habitat.obo` * `ancillaries/OntoBiotope_BioNLP-OST-2019-Phenotype.obo` * `ancillaries/Use_V2.obo` * outputs - * `corpora/microbes-2019/expander` - * `corpora/microbes-2019/index` + * `corpora/pubmed/expander` + * `corpora/pubmed/index` * `ancillaries/Florilege/2019-12-12/PubMed-Habitat-2019-12-12.txt` * `ancillaries/Florilege/2019-12-12/PubMed-Phenotype-2019-12-12.txt` * programs diff --git a/docs/run.md b/docs/run.md index 9f9bf62f..7d59c9ca 100644 --- a/docs/run.md +++ b/docs/run.md @@ -15,8 +15,8 @@ conda activate snakemake-5.13.0-env * CIRM corpus is processed in **step 3.**, the corpus and results are localized into the `corpora/cirm` folder. * GenBank corpus is processed in **step 4.**, the corpus and results are localized into the `corpora/genbank` folder. * DSMZ corpus is processed in **step 5.**, the corpus and results are localized into the `corpora/dsmz` folder. -* Pubmed corpus is processed in **step 6.**, the corpus and results are localized into the `corpora/microbes-2019` folder. -Pubmed corpus is to split into several batches to put in the `corpora/microbes-2019/batch` folder. +* Pubmed corpus is processed in **step 6.**, the corpus and results are localized into the `corpora/pubmed` folder. +Pubmed corpus is to be split into several batches placed in the `corpora/pubmed/batches` folder. 
## Expander diff --git a/docs/stats.md b/docs/stats.md index 81008c60..e9eae40e 100644 --- a/docs/stats.md +++ b/docs/stats.md @@ -2,14 +2,14 @@ | SOURCE | LABEL | COUNT | | -------- |-------- |-------- | -| Pubmed | nb articles | count_files(corpora/microbes-2019/batch/*) x 1000 | -| Pubmed | nb habitats | count_lines(corpora/microbes-2019/habitats.full.txt) | -| Pubmed | nb relations | count_lines(corpora/microbes-2019/relations.full.txt) | -| pubmed | nb microorganisms | count_lines(corpora/microbes-2019/microorganisms.full.txt) | -| Pubmed | nb uses | count_lines(corpora/microbes-2019/uses.full.txt) | -| Pubmed | nb phenotype-relations | count_lines(corpora/microbes-2019/phenotype-relations.full.txt) | -| Pubmed | nb uses-relations | count_lines(corpora/microbes-2019/uses-relations.full.txt) | -| Pubmed | nb phenotypes | count_lines(corpora/microbes-2019/phenotypes.full.txt) | +| Pubmed | nb articles | count_files(corpora/pubmed/batches/*) x 1000 | +| Pubmed | nb habitats | count_lines(corpora/pubmed/habitats.full.txt) | +| Pubmed | nb relations | count_lines(corpora/pubmed/relations.full.txt) | +| pubmed | nb microorganisms | count_lines(corpora/pubmed/microorganisms.full.txt) | +| Pubmed | nb uses | count_lines(corpora/pubmed/uses.full.txt) | +| Pubmed | nb phenotype-relations | count_lines(corpora/pubmed/phenotype-relations.full.txt) | +| Pubmed | nb uses-relations | count_lines(corpora/pubmed/uses-relations.full.txt) | +| Pubmed | nb phenotypes | count_lines(corpora/pubmed/phenotypes.full.txt) | | cirm | nb entrees | count_lines(corpora/cirm/2019-07-05/extraction_3-fv.csv) | | cirm | nb yeast entrees | count_lines(corpora/cirm/Levures_2017/data_CIRM_levures_extraction_09032017.csv) | | cirm | nb entites | count_lines(corpora/cirm/mapped_taxids.txt) | diff --git a/generate_stats.snakefile b/generate_stats.snakefile index 8504c5d3..40c76264 100644 --- a/generate_stats.snakefile +++ b/generate_stats.snakefile @@ -10,7 +10,7 @@ rule all: -SOURCES=["cirm", 
"genbank", "dsmz", "microbes-2019", "BioNLP-OST-2019"] +SOURCES=["cirm", "genbank", "dsmz", "pubmed", "BioNLP-OST-2019"] ''' cirm | nb entrees | count_lines(corpora/cirm/BIA_2021/florilege_export_final_17_02_21.xlsx) @@ -165,14 +165,14 @@ rule merge_stats_dsmz: ''' -Pubmed | nb articles | count_files(corpora/microbes-2019/batch/*) x 1000 -Pubmed | nb habitats | count_lines(corpora/microbes-2019/habitats.full.txt) -Pubmed | nb relations | count_lines(corpora/microbes-2019/relations.full.txt) -pubmed | nb microorganisms | count_lines(corpora/microbes-2019/microorganisms.full.txt) -Pubmed | nb uses | count_lines(corpora/microbes-2019/uses.full.txt) -Pubmed | nb phenotype-relations | count_lines(corpora/microbes-2019/phenotype-relations.full.txt) -Pubmed | nb uses-relations | count_lines(corpora/microbes-2019/uses-relations.full.txt) -Pubmed | nb phenotypes | count_lines(corpora/microbes-2019/phenotypes.full.txt) +Pubmed | nb articles | count_files(corpora/pubmed/batches/*) x 1000 +Pubmed | nb habitats | count_lines(corpora/pubmed/habitats.full.txt) +Pubmed | nb relations | count_lines(corpora/pubmed/relations.full.txt) +pubmed | nb microorganisms | count_lines(corpora/pubmed/microorganisms.full.txt) +Pubmed | nb uses | count_lines(corpora/pubmed/uses.full.txt) +Pubmed | nb phenotype-relations | count_lines(corpora/pubmed/phenotype-relations.full.txt) +Pubmed | nb uses-relations | count_lines(corpora/pubmed/uses-relations.full.txt) +Pubmed | nb phenotypes | count_lines(corpora/pubmed/phenotypes.full.txt) ''' ENTREES_PUBMED = ["list_of_batches.txt"] SORTIES_PUBMED = ["relations.full.txt", "phenotype-relations.full.txt", "uses-relations.full.txt", "microorganisms.full.txt", "habitats.full.txt", "phenotypes.full.txt", "uses.full.txt"] @@ -184,11 +184,11 @@ FILES_PUBMED = ENTREES_PUBMED + SORTIES_PUBMED rule stats_pubmed: input: - file="corpora/microbes-2019/{file}" + file="corpora/pubmed/{file}" output: - stats="corpora/microbes-2019/stats/{file}_stats.csv" + 
stats="corpora/pubmed/stats/{file}_stats.csv" params: - result="microbes-2019/{file}", + result="pubmed/{file}", c0="source", v0="pubmed", c1="uri", @@ -205,9 +205,9 @@ merge ''' rule merge_stats_pubmed: input: - files=expand("corpora/microbes-2019/stats/{file}_stats.csv", file=FILES_PUBMED) + files=expand("corpora/pubmed/stats/{file}_stats.csv", file=FILES_PUBMED) output: - result="corpora/microbes-2019/stats/stats.full.csv" + result="corpora/pubmed/stats/stats.full.csv" run: import pandas frames = [ pandas.read_csv(f) for f in input.files ] @@ -229,8 +229,8 @@ eval_BB19-rel+ner_003, score sur la prédiction des Exhibits de BB19-rel+ner,Bio eval_BB19-kb+ner_001, mesure pour l'evaluation de BB19-kb+ner,BioNLP-OST-2019/BB19-kb+ner/eval.json#eval_BB19-kb+ner_001#eval_BB19-kb+ner_002 eval_BB19-kb+ner_002, score moyen sur BB19-kb+ner,BioNLP-OST-2019/BB19-kb+ner/eval.json#eval_BB19-kb+ner_002 ''' -ENTREES_EVAL = ["BioNLP-OST-2019/batch/BB19-norm+ner", "BioNLP-OST-2019/batch/BB19-rel+ner", "BioNLP-OST-2019/batch/BB19-kb+ner"] -SORTIES_EVAL = ["BioNLP-OST-2019/batch/BB19-norm+ner/eval.json", "BioNLP-OST-2019/batch/BB19-rel+ner/eval.json", "BioNLP-OST-2019/batch/BB19-kb+ner/eval.json"] +ENTREES_EVAL = ["BioNLP-OST-2019/batches/BB19-norm+ner", "BioNLP-OST-2019/batches/BB19-rel+ner", "BioNLP-OST-2019/batches/BB19-kb+ner"] +SORTIES_EVAL = ["BioNLP-OST-2019/batches/BB19-norm+ner/eval.json", "BioNLP-OST-2019/batches/BB19-rel+ner/eval.json", "BioNLP-OST-2019/batches/BB19-kb+ner/eval.json"] FILES_EVAL = ["BB19-norm+ner", "BB19-rel+ner", "BB19-kb+ner"] def get_score_stats(file, entity): @@ -249,7 +249,7 @@ def get_score_stats(file, entity): ''' rule stats_eval_BB19_norm: input: - file="corpora/BioNLP-OST-2019/batch/BB19-norm+ner/eval.json" + file="corpora/BioNLP-OST-2019/batches/BB19-norm+ner/eval.json" output: stats="corpora/BioNLP-OST-2019/stats/BB19-norm+ner_stats.csv" params: @@ -274,7 +274,7 @@ rule stats_eval_BB19_norm: ''' rule stats_eval_BB19_rel: input: - 
file="corpora/BioNLP-OST-2019/batch/BB19-rel+ner/eval.json" + file="corpora/BioNLP-OST-2019/batches/BB19-rel+ner/eval.json" output: stats="corpora/BioNLP-OST-2019/stats/BB19-rel+ner_stats.csv" params: @@ -299,7 +299,7 @@ rule stats_eval_BB19_rel: ''' rule stats_eval_BB19_kb: input: - file="corpora/BioNLP-OST-2019/batch/BB19-kb+ner/eval.json" + file="corpora/BioNLP-OST-2019/batches/BB19-kb+ner/eval.json" output: stats="corpora/BioNLP-OST-2019/stats/BB19-kb+ner_stats.csv" params: diff --git a/plans/entities.plan b/plans/entities.plan index 80138040..9eef3d8f 100644 --- a/plans/entities.plan +++ b/plans/entities.plan @@ -195,14 +195,14 @@ <read> <pubmed class="XMLReader"> - <sourcePath>corpora/&corpus;/batch/&batch;/batch.xml</sourcePath> + <sourcePath>corpora/&corpus;/batches/&batch;/batch.xml</sourcePath> <xslTransform>ancillaries/&corpus;-pubmed2alvisnlp.xslt</xslTransform> </pubmed> <bionlp-st class="BioNLPSTReader"> <active>true</active> <sectionName>abstract</sectionName> - <textDir>corpora/&corpus;/batch/&batch;/bionlp-st</textDir> + <textDir>corpora/&corpus;/batches/&batch;/bionlp-st</textDir> </bionlp-st> </read> @@ -407,7 +407,7 @@ <!-- Run Yatea term extractor --> <yatea class="YateaExtractor"> <sectionFilter>@name == "title" or @name == "abstract"</sectionFilter> - <xmlTermsFile>corpora/&corpus;/batch/&batch;/yatea/candidates.xml</xmlTermsFile> + <xmlTermsFile>corpora/&corpus;/batches/&batch;/yatea/candidates.xml</xmlTermsFile> <posFeature>tt_pos</posFeature> <configDir>ancillaries/YaTeA/config-habitats</configDir> <localeDir>ancillaries/YaTeA/locale</localeDir> @@ -418,7 +418,7 @@ <!-- Run Yatea term extractor on variants --> <yatea-var class="YateaExtractor"> <sectionFilter>@name == "title" or @name == "abstract"</sectionFilter> - <xmlTermsFile>corpora/&corpus;/batch/&batch;/yatea-var/candidates.xml</xmlTermsFile> + <xmlTermsFile>corpora/&corpus;/batches/&batch;/yatea-var/candidates.xml</xmlTermsFile> <posFeature>tt_pos</posFeature> 
<lemmaFeature>variant</lemmaFeature> <configDir>ancillaries/YaTeA/config-habitats</configDir> @@ -672,7 +672,7 @@ <output> <doc-mesh class="TabularExport"> - <outDir>corpora/&corpus;/batch/&batch;</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"doc-mesh.txt"</fileName> <lines>documents.sections:mesh</lines> @@ -686,7 +686,7 @@ </doc-mesh> <taxa class="TabularExport"> - <outDir>corpora/&corpus;/batch/&batch;</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"taxa.txt"</fileName> <lines>documents.sections.layer:taxa</lines> @@ -705,7 +705,7 @@ </taxa> <microorganisms class="TabularExport"> - <outDir>corpora/&corpus;/batch/&batch;</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"microorganisms.txt"</fileName> <lines>documents.sections.layer:microorganism</lines> @@ -724,7 +724,7 @@ </microorganisms> <microorganisms-short class="TabularExport"> - <outDir>corpora/&corpus;/batch/&batch;</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"microorganisms-short.txt"</fileName> <lines>documents.sections.layer:microorganism[outside:words and not @form == outside:words.@form]</lines> @@ -743,7 +743,7 @@ </microorganisms-short> <bacteria class="TabularExport"> - <outDir>corpora/&corpus;/batch/&batch;</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"bacteria.txt"</fileName> <lines>documents.sections.layer:bacteria</lines> @@ -762,7 +762,7 @@ </bacteria> <habitats class="TabularExport"> - <outDir>corpora/&corpus;/batch/&batch;</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"habitats.txt"</fileName> <lines>documents.sections.layer:habitats</lines> @@ -783,7 +783,7 @@ </habitats> <phenotypes class="TabularExport"> - <outDir>corpora/&corpus;/batch/&batch;</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> 
<fileName>"phenotypes.txt"</fileName> <lines>documents.sections.layer:phenotypes</lines> @@ -804,7 +804,7 @@ </phenotypes> <uses class="TabularExport"> - <outDir>corpora/&corpus;/batch/&batch;</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"uses.txt"</fileName> <lines>documents.sections.layer:uses</lines> @@ -825,7 +825,7 @@ </uses> <geo class="TabularExport"> - <outDir>corpora/&corpus;/batch/&batch;</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"geo.txt"</fileName> <lines>documents.sections.layer:Geographical</lines> @@ -840,7 +840,7 @@ </geo> <relations class="TabularExport"> - <outDir>corpora/&corpus;/batch/&batch;</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"relations.txt"</fileName> <lines>documents.sections.relations:CooccurrenceLocalization.tuples</lines> @@ -863,7 +863,7 @@ </relations> <relations-pheno class="TabularExport"> - <outDir>corpora/&corpus;/batch/&batch;</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"phenotype-relations.txt"</fileName> <lines>documents.sections.relations:PhenotypeRelation.tuples</lines> @@ -886,7 +886,7 @@ </relations-pheno> <relations-use class="TabularExport"> - <outDir>corpora/&corpus;/batch/&batch;</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"uses-relations.txt"</fileName> <lines>documents.sections.relations:UseRelation.tuples</lines> @@ -909,7 +909,7 @@ </relations-use> <!-- <rdf class="RDFExport"> --> - <!-- <outDir>corpora/&corpus;/batch/&batch;</outDir> --> + <!-- <outDir>corpora/&corpus;/batches/&batch;</outDir> --> <!-- <files>$</files> --> <!-- <fileName>"&batch;.ttl"</fileName> --> <!-- <format>turtle</format> --> @@ -941,7 +941,7 @@ <!-- <module id="document-richness" class="TabularExport"> - <outDir>corpora/&corpus;/batch/&batch;</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> 
<fileName>"document-richness.txt"</fileName> <lines>documents[sections[layer:microorganism and layer:habitats]]</lines> @@ -977,7 +977,7 @@ </index-sentences> <sentences class="TabularExport"> - <outDir>corpora/&corpus;/batch/&batch;</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"sentences.txt"</fileName> <lines>documents.sections.layer:sentences[@name != "author"]</lines> @@ -998,7 +998,7 @@ </sentences> <anaphora class="TabularExport"> - <outDir>corpora/&corpus;/batch/&batch;</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"anaphora.txt"</fileName> <lines>documents.sections.relations:coreferences.tuples[args:Ante]</lines> @@ -1031,7 +1031,7 @@ </anaphora> <dependencies class="TabularExport"> - <outDir>corpora/&corpus;/batch/&batch;</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"dependencies.txt"</fileName> <lines>documents.sections[@name != "author"].relations:dependencies.tuples</lines> @@ -1163,7 +1163,7 @@ <!-- <module id="annotation" class="ExportCadixeJSON"> --> <!-- <sectionFilter>@name == "title" or @name == "abstract"</sectionFilter> --> - <!-- <outDir>corpora/&corpus;/batch/&batch;/json/annotation</outDir> --> + <!-- <outDir>corpora/&corpus;/batches/&batch;/json/annotation</outDir> --> <!-- <documentDescription>"[" ^ @id ^ "] " ^ sections:title.contents</documentDescription> --> <!-- <documentProperties>DocumentID=@id</documentProperties> --> <!-- <annotationSets> --> @@ -1250,7 +1250,7 @@ </habitat-ancestors> <index class="AlvisDBIndexer"> - <indexDir>corpora/&corpus;/batch/&batch;/adb</indexDir> + <indexDir>corpora/&corpus;/batches/&batch;/adb</indexDir> <elements> <relations> <items>documents.sections.relations:CooccurrenceLocalization.tuples[args:Bacterium[@bacteria == "true"]]</items> @@ -1270,7 +1270,7 @@ </adb> <index class="AlvisIRIndexer"> - <indexDir>corpora/&corpus;/batch/&batch;/index</indexDir> + 
<indexDir>corpora/&corpus;/batches/&batch;/index</indexDir> <tokenPositionGap>9216</tokenPositionGap> <fieldNames>title,abstract,author,full-author,pmid,year,journal,mesh,url</fieldNames> <relations> @@ -1408,7 +1408,7 @@ </index> <index-food class="AlvisIRIndexer"> - <indexDir>corpora/&corpus;/batch/&batch;/index-food</indexDir> + <indexDir>corpora/&corpus;/batches/&batch;/index-food</indexDir> <tokenPositionGap>9216</tokenPositionGap> <fieldNames>title,abstract,author,full-author,pmid,year,journal,mesh,url</fieldNames> <relations> @@ -1566,14 +1566,14 @@ </add-feature3> <html class="QuickHTML"> <active>false</active> - <outDir>corpora/&corpus;/batch/&batch;/html</outDir> + <outDir>corpora/&corpus;/batches/&batch;/html</outDir> <classFeature>ne-type</classFeature> <layers>phenotypes,microorganism,habitats</layers> <colors>#99cc00,#ffcc99,#ffd333,#ffd666</colors> </html> <words class="TabularExport"> - <outDir>corpora/&corpus;/batch/&batch;</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"words.txt"</fileName> <lines>documents.sections[@name == "title" or @name == "abstract"].layer:words</lines> @@ -1585,7 +1585,7 @@ <bionlp-st-a2> <habitats class="TabularExport"> - <outDir>corpora/&corpus;/batch/&batch;/a2</outDir> + <outDir>corpora/&corpus;/batches/&batch;/a2</outDir> <files>documents.sections</files> <fileName>document.@id ^ ".a2"</fileName> <lines>layer:habitats</lines> @@ -1598,7 +1598,7 @@ <phenotypes class="TabularExport"> <append/> - <outDir>corpora/&corpus;/batch/&batch;/a2</outDir> + <outDir>corpora/&corpus;/batches/&batch;/a2</outDir> <files>documents.sections</files> <fileName>document.@id ^ ".a2"</fileName> <lines>layer:phenotypes</lines> @@ -1611,7 +1611,7 @@ <microorganisms class="TabularExport"> <append/> - <outDir>corpora/&corpus;/batch/&batch;/a2</outDir> + <outDir>corpora/&corpus;/batches/&batch;/a2</outDir> <files>documents.sections</files> <fileName>document.@id ^ ".a2"</fileName> 
<lines>layer:microorganism</lines> @@ -1624,7 +1624,7 @@ <obt class="TabularExport"> <append/> - <outDir>corpora/&corpus;/batch/&batch;/a2</outDir> + <outDir>corpora/&corpus;/batches/&batch;/a2</outDir> <files>documents.sections</files> <fileName>document.@id ^ ".a2"</fileName> <lines>layer:habitats|layer:phenotypes</lines> @@ -1636,7 +1636,7 @@ <taxid class="TabularExport"> <append/> - <outDir>corpora/&corpus;/batch/&batch;/a2</outDir> + <outDir>corpora/&corpus;/batches/&batch;/a2</outDir> <files>documents.sections</files> <fileName>document.@id ^ ".a2"</fileName> <lines>layer:microorganism</lines> @@ -1648,7 +1648,7 @@ <lives-in class="TabularExport"> <append/> - <outDir>corpora/&corpus;/batch/&batch;/a2</outDir> + <outDir>corpora/&corpus;/batches/&batch;/a2</outDir> <files>documents.sections</files> <fileName>document.@id ^ ".a2"</fileName> <lines>relations:CooccurrenceLocalization.tuples[args:Localization.@concept-id != ""]</lines> @@ -1660,7 +1660,7 @@ <exhibits class="TabularExport"> <append/> - <outDir>corpora/&corpus;/batch/&batch;/a2</outDir> + <outDir>corpora/&corpus;/batches/&batch;/a2</outDir> <files>documents.sections</files> <fileName>document.@id ^ ".a2"</fileName> <lines>relations:PhenotypeRelation.tuples</lines> @@ -1673,7 +1673,7 @@ <success class="TabularExport"> - <outDir>corpora/&corpus;/batch/&batch;</outDir> + <outDir>corpora/&corpus;/batches/&batch;</outDir> <files>$</files> <fileName>"success.txt"</fileName> <lines>documents</lines> diff --git a/plans/tomap-habitats.plan b/plans/tomap-habitats.plan index d6abf84a..e5dcec1c 100644 --- a/plans/tomap-habitats.plan +++ b/plans/tomap-habitats.plan @@ -3,7 +3,7 @@ <!-- ToMap on lemmas --> <tomap class="TomapProjector"> - <yateaFile output-feed="true">corpora/&corpus;/batch/&batch;/yatea/candidates.xml</yateaFile> + <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> <targetLayerName>habitats</targetLayerName> <conceptFeature>concept-id</conceptFeature> 
<explanationFeaturePrefix>explain_</explanationFeaturePrefix> @@ -26,7 +26,7 @@ <tomap-on-alternative-lemmas> <tomap class="TomapProjector"> - <yateaFile output-feed="true">corpora/&corpus;/batch/&batch;/yatea/candidates.xml</yateaFile> + <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> <targetLayerName>habitats2</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> @@ -58,7 +58,7 @@ <tomap-no-lemmakeys> <tomap class="TomapProjector"> - <yateaFile output-feed="true">corpora/&corpus;/batch/&batch;/yatea/candidates.xml</yateaFile> + <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> <targetLayerName>habitats3</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> @@ -89,7 +89,7 @@ <tomap-on-variants> <tomap class="TomapProjector"> - <yateaFile output-feed="true">corpora/&corpus;/batch/&batch;/yatea-var/candidates.xml</yateaFile> + <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea-var/candidates.xml</yateaFile> <targetLayerName>habitats4</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> @@ -121,7 +121,7 @@ <tomap-no-lemmakeys-word-form> <tomap class="TomapProjector"> - <yateaFile output-feed="true">corpora/&corpus;/batch/&batch;/yatea/candidates.xml</yateaFile> + <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> <targetLayerName>habitats5</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> @@ -196,7 +196,7 @@ <bioyatea-projection class="YateaTermsProjector"> <targetLayerName>yateaTerms</targetLayerName> <!--<yateaFile inhibitCheck="true">words_prepro/default/xml/candidates_pp.xml</yateaFile>--> - <yateaFile 
output-feed="yes">corpora/&corpus;/batch/&batch;/yatea/candidates.xml</yateaFile> <!-- ??? --> + <yateaFile output-feed="yes">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> <!-- ??? --> <subject layer="words"/> <termLemma>lemma</termLemma> </bioyatea-projection> diff --git a/plans/tomap-microbial-phenotypes.plan b/plans/tomap-microbial-phenotypes.plan index d6b4a22f..17481643 100644 --- a/plans/tomap-microbial-phenotypes.plan +++ b/plans/tomap-microbial-phenotypes.plan @@ -2,7 +2,7 @@ <!-- ToMap on lemmas --> <tomap class="TomapProjector"> - <yateaFile output-feed="true">corpora/&corpus;/batch/&batch;/yatea/candidates.xml</yateaFile> + <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> <targetLayerName>phenotypes</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> @@ -25,7 +25,7 @@ <tomap-on-alternative-lemmas> <tomap class="TomapProjector"> - <yateaFile output-feed="true">corpora/&corpus;/batch/&batch;/yatea/candidates.xml</yateaFile> + <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> <targetLayerName>phenotypes2</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> @@ -57,7 +57,7 @@ <tomap-no-lemmakeys> <tomap class="TomapProjector"> - <yateaFile output-feed="true">corpora/&corpus;/batch/&batch;/yatea/candidates.xml</yateaFile> + <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> <targetLayerName>phenotypes3</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> @@ -89,7 +89,7 @@ <tomap-on-variants> <tomap class="TomapProjector"> - <yateaFile output-feed="true">corpora/&corpus;/batch/&batch;/yatea-var/candidates.xml</yateaFile> + <yateaFile 
output-feed="true">corpora/&corpus;/batches/&batch;/yatea-var/candidates.xml</yateaFile> <targetLayerName>phenotypes4</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> @@ -121,7 +121,7 @@ <tomap-no-lemmakeys-word-form> <tomap class="TomapProjector"> - <yateaFile output-feed="true">corpora/&corpus;/batch/&batch;/yatea/candidates.xml</yateaFile> + <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> <targetLayerName>phenotypes5</targetLayerName> <conceptFeature>concept-id</conceptFeature> <explanationFeaturePrefix>explain_</explanationFeaturePrefix> @@ -166,7 +166,7 @@ <bioyatea-projection class="YateaTermsProjector"> <targetLayerName>yateaTerms</targetLayerName> <!--<yateaFile inhibitCheck="true">words_prepro/default/xml/candidates_pp.xml</yateaFile>--> - <yateaFile output-feed="yes">corpora/&corpus;/batch/&batch;/yatea/candidates.xml</yateaFile> <!-- ??? --> + <yateaFile output-feed="yes">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> <!-- ??? 
--> <subject layer="words"/> <termLemma>lemma</termLemma> </bioyatea-projection> diff --git a/process-evaluate_BioNLP-OST.snakefile b/process-evaluate_BioNLP-OST.snakefile index e9f494ff..d3dca3e6 100644 --- a/process-evaluate_BioNLP-OST.snakefile +++ b/process-evaluate_BioNLP-OST.snakefile @@ -13,8 +13,8 @@ all ''' rule all: input: - log=expand("corpora/BioNLP-OST-2019/batch/{B}/eval.json", B=BATCHES), - scores=expand("corpora/BioNLP-OST-2019/batch/{B}/eval.json", B=BATCHES) + log=expand("corpora/BioNLP-OST-2019/batches/{B}/eval.json", B=BATCHES), + scores=expand("corpora/BioNLP-OST-2019/batches/{B}/eval.json", B=BATCHES) ''' @@ -23,19 +23,19 @@ batches using the alvisnlp plan (entities.plan) ''' rule run_bionlp_prediction: input: - dir=directory("corpora/BioNLP-OST-2019/batch/{B}/bionlp-st"), - xslt="corpora/microbes-2019/microbes-2019-pubmed2alvisnlp.xslt" + dir=directory("corpora/BioNLP-OST-2019/batches/{B}/bionlp-st"), + xslt="corpora/pubmed/microbes-2019-pubmed2alvisnlp.xslt" output: - relations="corpora/BioNLP-OST-2019/batch/{B}/relations.txt", - phenotypeRelations="corpora/BioNLP-OST-2019/batch/{B}/phenotype-relations.txt", - usesRelations="corpora/BioNLP-OST-2019/batch/{B}/uses-relations.txt", - microorganisms="corpora/BioNLP-OST-2019/batch/{B}/microorganisms.txt", - habitats="corpora/BioNLP-OST-2019/batch/{B}/habitats.txt", - phenotypes="corpora/BioNLP-OST-2019/batch/{B}/phenotypes.txt", - uses="corpora/BioNLP-OST-2019/batch/{B}/uses.txt", - index=directory("corpora/BioNLP-OST-2019/batch/{B}/index"), - a2=directory("corpora/BioNLP-OST-2019/batch/{B}/a2") - log:"corpora/BioNLP-OST-2019/batch/{B}/alvisnlp.log" + relations="corpora/BioNLP-OST-2019/batches/{B}/relations.txt", + phenotypeRelations="corpora/BioNLP-OST-2019/batches/{B}/phenotype-relations.txt", + usesRelations="corpora/BioNLP-OST-2019/batches/{B}/uses-relations.txt", + microorganisms="corpora/BioNLP-OST-2019/batches/{B}/microorganisms.txt", + 
habitats="corpora/BioNLP-OST-2019/batches/{B}/habitats.txt", + phenotypes="corpora/BioNLP-OST-2019/batches/{B}/phenotypes.txt", + uses="corpora/BioNLP-OST-2019/batches/{B}/uses.txt", + index=directory("corpora/BioNLP-OST-2019/batches/{B}/index"), + a2=directory("corpora/BioNLP-OST-2019/batches/{B}/a2") + log:"corpora/BioNLP-OST-2019/batches/{B}/alvisnlp.log" params: batch="{B}", corpus='BioNLP-OST-2019', @@ -43,7 +43,7 @@ rule run_bionlp_prediction: onto='ancillaries/BioNLP-OST+EnovFood', ontobiotopeUse='ancillaries/Use_V2', plan='plans/entities.plan', - dir='corpora/BioNLP-OST-2019/batch/{B}/', + dir='corpora/BioNLP-OST-2019/batches/{B}/', taxid_microorganisms='ancillaries/ncbi-taxonomy-prefix/taxid_microorganisms.txt', taxa_id_full='ancillaries/ncbi-taxonomy-prefix/taxa+id_full.txt' singularity:config["SINGULARITY_IMG"] @@ -67,18 +67,18 @@ rule run_bionlp_prediction: rule archive_prediction: input: - a2=directory("corpora/BioNLP-OST-2019/batch/{B}/a2") + a2=directory("corpora/BioNLP-OST-2019/batches/{B}/a2") output: - zip="corpora/BioNLP-OST-2019/batch/{B}/predictions.zip" + zip="corpora/BioNLP-OST-2019/batches/{B}/predictions.zip" shell: """zip -9 {output.zip} {input.a2}/*.a2""" rule evaluate: input: - zip="corpora/BioNLP-OST-2019/batch/{B}/predictions.zip" + zip="corpora/BioNLP-OST-2019/batches/{B}/predictions.zip" output: - scores="corpora/BioNLP-OST-2019/batch/{B}/eval.json" + scores="corpora/BioNLP-OST-2019/batches/{B}/eval.json" params: api=config["BIONLPOST_API"], task="{B}" diff --git a/process_PubMed_corpus.snakefile b/process_PubMed_corpus.snakefile index b5a20ac3..16c8bef8 100644 --- a/process_PubMed_corpus.snakefile +++ b/process_PubMed_corpus.snakefile @@ -1,5 +1,6 @@ ## local rule # localrules: all, concat_results +localrules:create_dummy_bionlp_st_dir ## config file configfile: "config/config.yaml" @@ -16,10 +17,10 @@ all ''' rule all: input: - results=expand("corpora/microbes-2019/{R}.full.txt", R=RESULTS), - 
log=expand("corpora/microbes-2019/batch/{B}/alvisnlp.log", B=BATCHES), - index_folder="corpora/microbes-2019/index", - expander_folder="corpora/microbes-2019/expander", + results=expand("corpora/pubmed/{R}.full.txt", R=RESULTS), + log=expand("corpora/pubmed/batches/{B}/alvisnlp.log", B=BATCHES), + index_folder="corpora/pubmed/index", + expander_folder="corpora/pubmed/expander", florilege_Habitat_result="corpora/florilege/pubmed/PubMed-Habitat.txt", florilege_Phenotype_result="corpora/florilege/pubmed/PubMed-Phenotype.txt", florilege_Use_result="corpora/florilege/pubmed/PubMed-Use.txt" @@ -27,7 +28,7 @@ rule all: rule create_dummy_bionlp_st_dir: output: - dummy=directory("corpora/microbes-2019/batch/{B}/bionlp-st") + dummy=directory("corpora/pubmed/batches/{B}/bionlp-st") shell: '''mkdir -p {output.dummy}''' @@ -38,27 +39,27 @@ batches using the alvisnlp plan (entities.plan) ''' rule run_pubmed_entities: input: - file="corpora/microbes-2019/batch/{B}/batch.xml", - dummy=("corpora/microbes-2019/batch/{B}/bionlp-st"), - xslt="corpora/microbes-2019/microbes-2019-pubmed2alvisnlp.xslt" + file="corpora/pubmed/batches/{B}/batch.xml", + dummy=("corpora/pubmed/batches/{B}/bionlp-st"), + xslt="corpora/pubmed/microbes-2019-pubmed2alvisnlp.xslt" output: - relations="corpora/microbes-2019/batch/{B}/relations.txt", - phenotypeRelations="corpora/microbes-2019/batch/{B}/phenotype-relations.txt", - usesRelations="corpora/microbes-2019/batch/{B}/uses-relations.txt", - microorganisms="corpora/microbes-2019/batch/{B}/microorganisms.txt", - habitats="corpora/microbes-2019/batch/{B}/habitats.txt", - phenotypes="corpora/microbes-2019/batch/{B}/phenotypes.txt", - uses="corpora/microbes-2019/batch/{B}/uses.txt", - index=directory("corpora/microbes-2019/batch/{B}/index") - log:"corpora/microbes-2019/batch/{B}/alvisnlp.log" + relations="corpora/pubmed/batches/{B}/relations.txt", + phenotypeRelations="corpora/pubmed/batches/{B}/phenotype-relations.txt", + 
usesRelations="corpora/pubmed/batches/{B}/uses-relations.txt", + microorganisms="corpora/pubmed/batches/{B}/microorganisms.txt", + habitats="corpora/pubmed/batches/{B}/habitats.txt", + phenotypes="corpora/pubmed/batches/{B}/phenotypes.txt", + uses="corpora/pubmed/batches/{B}/uses.txt", + index=directory("corpora/pubmed/batches/{B}/index") + log:"corpora/pubmed/batches/{B}/alvisnlp.log" params: batch="{B}", - corpus='microbes-2019', + corpus='pubmed', inhibitSyntax='inhibit-syntax', onto='ancillaries/BioNLP-OST+EnovFood', ontobiotopeUse='ancillaries/Use_V2', plan='plans/entities.plan', - dir='corpora/microbes-2019/batch/{B}/', + dir='corpora/pubmed/batches/{B}/', taxid_microorganisms='ancillaries/ncbi-taxonomy-prefix/taxid_microorganisms.txt', taxa_id_full='ancillaries/ncbi-taxonomy-prefix/taxa+id_full.txt' singularity:config["SINGULARITY_IMG"] @@ -96,9 +97,9 @@ for ''' rule concat_results: input: - expand("corpora/microbes-2019/batch/{B}/{{R}}.txt", B=BATCHES) + expand("corpora/pubmed/batches/{B}/{{R}}.txt", B=BATCHES) output: - result="corpora/microbes-2019/{R}.full.txt" + result="corpora/pubmed/{R}.full.txt" run: with open(output.result, 'w') as out: for fname in input: @@ -112,9 +113,9 @@ merge indexes from the batches ''' rule merge_pubmed_index: input: - index=expand("corpora/microbes-2019/batch/{B}/index", B=BATCHES) + index=expand("corpora/pubmed/batches/{B}/index", B=BATCHES) output: - index_folder=directory("corpora/microbes-2019/index") + index_folder=directory("corpora/pubmed/index") params: alvisir=config["ALVISIR_HOME"] shell: """ @@ -136,7 +137,7 @@ rule create_pubmed_expander: onto_phenotype="ancillaries/BioNLP-OST+EnovFood-Phenotype.obo", onto_use="ancillaries/Use_V2.obo" output: - expander_folder=directory("corpora/microbes-2019/expander") + expander_folder=directory("corpora/pubmed/expander") params: alvisir=config["ALVISIR_HOME"] shell:""" @@ -150,7 +151,7 @@ integration in Florilege ''' rule format_pubmed_relations: input: - 
file="corpora/microbes-2019/relations.full.txt" + file="corpora/pubmed/relations.full.txt" output: florilege_result="corpora/florilege/pubmed/PubMed-Habitat.txt" conda: 'softwares/envs/python3_env.yaml' @@ -167,7 +168,7 @@ integration in Florilege ''' rule format_pubmed_phenotype_relations: input: - file="corpora/microbes-2019/phenotype-relations.full.txt" + file="corpora/pubmed/phenotype-relations.full.txt" output: florilege_result="corpora/florilege/pubmed/PubMed-Phenotype.txt" conda: 'softwares/envs/python3_env.yaml' @@ -183,7 +184,7 @@ integration in Florilege ''' rule format_pubmed_use_relations: input: - file="corpora/microbes-2019/uses-relations.full.txt" + file="corpora/pubmed/uses-relations.full.txt" output: florilege_result="corpora/florilege/pubmed/PubMed-Use.txt" conda: 'softwares/envs/python3_env.yaml' -- GitLab