From 9152896d320959e99f54114a0860ff3c1e14cb04 Mon Sep 17 00:00:00 2001
From: Mouhamadou Ba <mouhamadou.ba@inra.fr>
Date: Tue, 6 Apr 2021 11:58:16 +0200
Subject: [PATCH] harmonize folders

---
 .../batch/0001/batch.xml                      |  0
 .../batch/0007/batch.xml                      |  0
 .../microbes-2019-pubmed2alvisnlp.xslt        |  0
 docs/6-process-pubmed-data.md                 |  8 +--
 docs/run.md                                   |  4 +-
 docs/stats.md                                 | 16 ++---
 generate_stats.snakefile                      | 38 +++++-----
 plans/entities.plan                           | 70 +++++++++----------
 plans/tomap-habitats.plan                     | 12 ++--
 plans/tomap-microbial-phenotypes.plan         | 12 ++--
 process-evaluate_BioNLP-OST.snakefile         | 38 +++++-----
 process_PubMed_corpus.snakefile               | 55 ++++++++-------
 12 files changed, 127 insertions(+), 126 deletions(-)
 rename corpora/{microbes-2019/batch => pubmed/batches}/0001/batch.xml (100%)
 rename corpora/{microbes-2019/batch => pubmed/batches}/0007/batch.xml (100%)
 rename corpora/{microbes-2019 => pubmed}/microbes-2019-pubmed2alvisnlp.xslt (100%)

diff --git a/corpora/microbes-2019/batch/0001/batch.xml b/corpora/pubmed/batches/0001/batch.xml
similarity index 100%
rename from corpora/microbes-2019/batch/0001/batch.xml
rename to corpora/pubmed/batches/0001/batch.xml
diff --git a/corpora/microbes-2019/batch/0007/batch.xml b/corpora/pubmed/batches/0007/batch.xml
similarity index 100%
rename from corpora/microbes-2019/batch/0007/batch.xml
rename to corpora/pubmed/batches/0007/batch.xml
diff --git a/corpora/microbes-2019/microbes-2019-pubmed2alvisnlp.xslt b/corpora/pubmed/microbes-2019-pubmed2alvisnlp.xslt
similarity index 100%
rename from corpora/microbes-2019/microbes-2019-pubmed2alvisnlp.xslt
rename to corpora/pubmed/microbes-2019-pubmed2alvisnlp.xslt
diff --git a/docs/6-process-pubmed-data.md b/docs/6-process-pubmed-data.md
index 71a0e2b2..7271d1d3 100644
--- a/docs/6-process-pubmed-data.md
+++ b/docs/6-process-pubmed-data.md
@@ -1,6 +1,6 @@
 ## About
 The pipeline extracts microorganisms, habitats of texts from Pubmed. The workflow uses alvisnlp plan `plans/entities.plan`. 
-Pubmed corpus is split into several batches that are available into `corpora/microbes-2019/batch`. 
+Pubmed corpus is split into several batches that are available in `corpora/pubmed/batches`. 
 The bacthes are automatically scanned by the pipeline.
  
  <img align="right" width="800" src="6-pipeline.svg">
@@ -56,13 +56,13 @@ The pipeline relies on the following alvisnlp plan:
 
 The pipeline handles the following resources :
 * inputs
-    * `corpora/microbes-2019`
+    * `corpora/pubmed`
     * `ancillaries/OntoBiotope_BioNLP-OST-2019-Habitat.obo`
     * `ancillaries/OntoBiotope_BioNLP-OST-2019-Phenotype.obo`
     * `ancillaries/Use_V2.obo`
 * outputs
-	* `corpora/microbes-2019/expander`
-	* `corpora/microbes-2019/index`
+	* `corpora/pubmed/expander`
+	* `corpora/pubmed/index`
 	* `ancillaries/Florilege/2019-12-12/PubMed-Habitat-2019-12-12.txt`
 	* `ancillaries/Florilege/2019-12-12/PubMed-Phenotype-2019-12-12.txt`
 * programs
diff --git a/docs/run.md b/docs/run.md
index 9f9bf62f..7d59c9ca 100644
--- a/docs/run.md
+++ b/docs/run.md
@@ -15,8 +15,8 @@ conda activate snakemake-5.13.0-env
 * CIRM corpus is processed in **step 3.**, the corpus and results are localized into the `corpora/cirm` folder.
 * GenBank corpus is processed in **step 4.**, the corpus and results are localized into the `corpora/genbank` folder. 
 * DSMZ corpus is processed in **step 5.**, the corpus and results are localized into the `corpora/dsmz` folder.
-* Pubmed corpus is processed in **step 6.**, the corpus and results are localized into the `corpora/microbes-2019` folder. 
-Pubmed corpus is to split into several batches to put in the `corpora/microbes-2019/batch` folder. 
+* Pubmed corpus is processed in **step 6.**, the corpus and results are localized into the `corpora/pubmed` folder. 
+Pubmed corpus is to be split into several batches placed in the `corpora/pubmed/batches` folder. 
 
 ## Expander
 
diff --git a/docs/stats.md b/docs/stats.md
index 81008c60..e9eae40e 100644
--- a/docs/stats.md
+++ b/docs/stats.md
@@ -2,14 +2,14 @@
 
 | SOURCE | LABEL | COUNT |
 | -------- |-------- |-------- |
-|  Pubmed | nb articles | count_files(corpora/microbes-2019/batch/*) x 1000  |
-|  Pubmed | nb habitats | count_lines(corpora/microbes-2019/habitats.full.txt)  |      
-|  Pubmed | nb relations | count_lines(corpora/microbes-2019/relations.full.txt)  |
-|  pubmed | nb microorganisms | count_lines(corpora/microbes-2019/microorganisms.full.txt)  |   
-|  Pubmed | nb uses | count_lines(corpora/microbes-2019/uses.full.txt)  |
-|  Pubmed | nb phenotype-relations | count_lines(corpora/microbes-2019/phenotype-relations.full.txt)  |
-|  Pubmed | nb uses-relations | count_lines(corpora/microbes-2019/uses-relations.full.txt)  |
-|  Pubmed | nb phenotypes | count_lines(corpora/microbes-2019/phenotypes.full.txt)  |
+|  Pubmed | nb articles | count_files(corpora/pubmed/batches/*) x 1000  |
+|  Pubmed | nb habitats | count_lines(corpora/pubmed/habitats.full.txt)  |      
+|  Pubmed | nb relations | count_lines(corpora/pubmed/relations.full.txt)  |
+|  pubmed | nb microorganisms | count_lines(corpora/pubmed/microorganisms.full.txt)  |   
+|  Pubmed | nb uses | count_lines(corpora/pubmed/uses.full.txt)  |
+|  Pubmed | nb phenotype-relations | count_lines(corpora/pubmed/phenotype-relations.full.txt)  |
+|  Pubmed | nb uses-relations | count_lines(corpora/pubmed/uses-relations.full.txt)  |
+|  Pubmed | nb phenotypes | count_lines(corpora/pubmed/phenotypes.full.txt)  |
 |  cirm | nb entrees | count_lines(corpora/cirm/2019-07-05/extraction_3-fv.csv)  |
 |  cirm | nb yeast entrees | count_lines(corpora/cirm/Levures_2017/data_CIRM_levures_extraction_09032017.csv)  |
 |  cirm | nb entites | count_lines(corpora/cirm/mapped_taxids.txt)  |
diff --git a/generate_stats.snakefile b/generate_stats.snakefile
index 8504c5d3..40c76264 100644
--- a/generate_stats.snakefile
+++ b/generate_stats.snakefile
@@ -10,7 +10,7 @@ rule all:
 
 
 
-SOURCES=["cirm", "genbank", "dsmz", "microbes-2019", "BioNLP-OST-2019"]
+SOURCES=["cirm", "genbank", "dsmz", "pubmed", "BioNLP-OST-2019"]
 
 '''
 cirm | nb entrees | count_lines(corpora/cirm/BIA_2021/florilege_export_final_17_02_21.xlsx)
@@ -165,14 +165,14 @@ rule merge_stats_dsmz:
 
 
 '''
-Pubmed | nb articles | count_files(corpora/microbes-2019/batch/*) x 1000
-Pubmed | nb habitats | count_lines(corpora/microbes-2019/habitats.full.txt)             
-Pubmed | nb relations | count_lines(corpora/microbes-2019/relations.full.txt)
-pubmed | nb microorganisms | count_lines(corpora/microbes-2019/microorganisms.full.txt)     
-Pubmed | nb uses | count_lines(corpora/microbes-2019/uses.full.txt)
-Pubmed | nb phenotype-relations | count_lines(corpora/microbes-2019/phenotype-relations.full.txt)
-Pubmed | nb uses-relations | count_lines(corpora/microbes-2019/uses-relations.full.txt)
-Pubmed | nb phenotypes | count_lines(corpora/microbes-2019/phenotypes.full.txt)
+Pubmed | nb articles | count_files(corpora/pubmed/batches/*) x 1000
+Pubmed | nb habitats | count_lines(corpora/pubmed/habitats.full.txt)             
+Pubmed | nb relations | count_lines(corpora/pubmed/relations.full.txt)
+pubmed | nb microorganisms | count_lines(corpora/pubmed/microorganisms.full.txt)     
+Pubmed | nb uses | count_lines(corpora/pubmed/uses.full.txt)
+Pubmed | nb phenotype-relations | count_lines(corpora/pubmed/phenotype-relations.full.txt)
+Pubmed | nb uses-relations | count_lines(corpora/pubmed/uses-relations.full.txt)
+Pubmed | nb phenotypes | count_lines(corpora/pubmed/phenotypes.full.txt)
 '''
 ENTREES_PUBMED = ["list_of_batches.txt"]
 SORTIES_PUBMED = ["relations.full.txt", "phenotype-relations.full.txt", "uses-relations.full.txt", "microorganisms.full.txt", "habitats.full.txt", "phenotypes.full.txt", "uses.full.txt"]
@@ -184,11 +184,11 @@ FILES_PUBMED = ENTREES_PUBMED + SORTIES_PUBMED
 
 rule stats_pubmed:
 	input:
-		file="corpora/microbes-2019/{file}"
+		file="corpora/pubmed/{file}"
 	output:
-		stats="corpora/microbes-2019/stats/{file}_stats.csv"
+		stats="corpora/pubmed/stats/{file}_stats.csv"
 	params:
-		result="microbes-2019/{file}",
+		result="pubmed/{file}",
 		c0="source",
 		v0="pubmed",
 		c1="uri",
@@ -205,9 +205,9 @@ merge
 '''
 rule merge_stats_pubmed:
 	input:
-		files=expand("corpora/microbes-2019/stats/{file}_stats.csv", file=FILES_PUBMED)
+		files=expand("corpora/pubmed/stats/{file}_stats.csv", file=FILES_PUBMED)
 	output:
-		result="corpora/microbes-2019/stats/stats.full.csv"
+		result="corpora/pubmed/stats/stats.full.csv"
 	run:
 		import pandas
 		frames = [ pandas.read_csv(f) for f in input.files ]
@@ -229,8 +229,8 @@ eval_BB19-rel+ner_003, score sur la prÃ©diction des Exhibits de BB19-rel+ner,Bio
 eval_BB19-kb+ner_001, mesure pour l'evaluation de BB19-kb+ner,BioNLP-OST-2019/BB19-kb+ner/eval.json#eval_BB19-kb+ner_001#eval_BB19-kb+ner_002
 eval_BB19-kb+ner_002, score moyen sur BB19-kb+ner,BioNLP-OST-2019/BB19-kb+ner/eval.json#eval_BB19-kb+ner_002
 '''
-ENTREES_EVAL = ["BioNLP-OST-2019/batch/BB19-norm+ner", "BioNLP-OST-2019/batch/BB19-rel+ner", "BioNLP-OST-2019/batch/BB19-kb+ner"]
-SORTIES_EVAL = ["BioNLP-OST-2019/batch/BB19-norm+ner/eval.json", "BioNLP-OST-2019/batch/BB19-rel+ner/eval.json", "BioNLP-OST-2019/batch/BB19-kb+ner/eval.json"]
+ENTREES_EVAL = ["BioNLP-OST-2019/batches/BB19-norm+ner", "BioNLP-OST-2019/batches/BB19-rel+ner", "BioNLP-OST-2019/batches/BB19-kb+ner"]
+SORTIES_EVAL = ["BioNLP-OST-2019/batches/BB19-norm+ner/eval.json", "BioNLP-OST-2019/batches/BB19-rel+ner/eval.json", "BioNLP-OST-2019/batches/BB19-kb+ner/eval.json"]
 FILES_EVAL = ["BB19-norm+ner", "BB19-rel+ner", "BB19-kb+ner"]
 
 def get_score_stats(file, entity):
@@ -249,7 +249,7 @@ def get_score_stats(file, entity):
 '''
 rule stats_eval_BB19_norm:
 	input:
-		file="corpora/BioNLP-OST-2019/batch/BB19-norm+ner/eval.json"
+		file="corpora/BioNLP-OST-2019/batches/BB19-norm+ner/eval.json"
 	output:
 		stats="corpora/BioNLP-OST-2019/stats/BB19-norm+ner_stats.csv"
 	params:
@@ -274,7 +274,7 @@ rule stats_eval_BB19_norm:
 '''
 rule stats_eval_BB19_rel:
 	input:
-		file="corpora/BioNLP-OST-2019/batch/BB19-rel+ner/eval.json"
+		file="corpora/BioNLP-OST-2019/batches/BB19-rel+ner/eval.json"
 	output:
 		stats="corpora/BioNLP-OST-2019/stats/BB19-rel+ner_stats.csv"
 	params:
@@ -299,7 +299,7 @@ rule stats_eval_BB19_rel:
 '''
 rule stats_eval_BB19_kb:
 	input:
-		file="corpora/BioNLP-OST-2019/batch/BB19-kb+ner/eval.json"
+		file="corpora/BioNLP-OST-2019/batches/BB19-kb+ner/eval.json"
 	output:
 		stats="corpora/BioNLP-OST-2019/stats/BB19-kb+ner_stats.csv"
 	params:
diff --git a/plans/entities.plan b/plans/entities.plan
index 80138040..9eef3d8f 100644
--- a/plans/entities.plan
+++ b/plans/entities.plan
@@ -195,14 +195,14 @@
 
   <read>
     <pubmed class="XMLReader">
-      <sourcePath>corpora/&corpus;/batch/&batch;/batch.xml</sourcePath>
+      <sourcePath>corpora/&corpus;/batches/&batch;/batch.xml</sourcePath>
       <xslTransform>ancillaries/&corpus;-pubmed2alvisnlp.xslt</xslTransform>
     </pubmed>
 
     <bionlp-st class="BioNLPSTReader">
       <active>true</active>
       <sectionName>abstract</sectionName>
-      <textDir>corpora/&corpus;/batch/&batch;/bionlp-st</textDir>
+      <textDir>corpora/&corpus;/batches/&batch;/bionlp-st</textDir>
     </bionlp-st>
   </read>
 
@@ -407,7 +407,7 @@
     <!-- Run Yatea term extractor -->
     <yatea class="YateaExtractor">
       <sectionFilter>@name == "title" or @name == "abstract"</sectionFilter>
-      <xmlTermsFile>corpora/&corpus;/batch/&batch;/yatea/candidates.xml</xmlTermsFile>
+      <xmlTermsFile>corpora/&corpus;/batches/&batch;/yatea/candidates.xml</xmlTermsFile>
       <posFeature>tt_pos</posFeature>
       <configDir>ancillaries/YaTeA/config-habitats</configDir>
       <localeDir>ancillaries/YaTeA/locale</localeDir>
@@ -418,7 +418,7 @@
     <!-- Run Yatea term extractor on variants -->
     <yatea-var class="YateaExtractor">
       <sectionFilter>@name == "title" or @name == "abstract"</sectionFilter>
-      <xmlTermsFile>corpora/&corpus;/batch/&batch;/yatea-var/candidates.xml</xmlTermsFile>
+      <xmlTermsFile>corpora/&corpus;/batches/&batch;/yatea-var/candidates.xml</xmlTermsFile>
       <posFeature>tt_pos</posFeature>
       <lemmaFeature>variant</lemmaFeature>
       <configDir>ancillaries/YaTeA/config-habitats</configDir>
@@ -672,7 +672,7 @@
 
   <output>
     <doc-mesh class="TabularExport">
-      <outDir>corpora/&corpus;/batch/&batch;</outDir>
+      <outDir>corpora/&corpus;/batches/&batch;</outDir>
       <files>$</files>
       <fileName>"doc-mesh.txt"</fileName>
       <lines>documents.sections:mesh</lines>
@@ -686,7 +686,7 @@
     </doc-mesh>
 
     <taxa class="TabularExport">
-      <outDir>corpora/&corpus;/batch/&batch;</outDir>
+      <outDir>corpora/&corpus;/batches/&batch;</outDir>
       <files>$</files>
       <fileName>"taxa.txt"</fileName>
       <lines>documents.sections.layer:taxa</lines>
@@ -705,7 +705,7 @@
     </taxa>
 
     <microorganisms class="TabularExport">
-      <outDir>corpora/&corpus;/batch/&batch;</outDir>
+      <outDir>corpora/&corpus;/batches/&batch;</outDir>
       <files>$</files>
       <fileName>"microorganisms.txt"</fileName>
       <lines>documents.sections.layer:microorganism</lines>
@@ -724,7 +724,7 @@
     </microorganisms>
 
     <microorganisms-short class="TabularExport">
-      <outDir>corpora/&corpus;/batch/&batch;</outDir>
+      <outDir>corpora/&corpus;/batches/&batch;</outDir>
       <files>$</files>
       <fileName>"microorganisms-short.txt"</fileName>
       <lines>documents.sections.layer:microorganism[outside:words and not @form == outside:words.@form]</lines>
@@ -743,7 +743,7 @@
     </microorganisms-short>
 
     <bacteria class="TabularExport">
-      <outDir>corpora/&corpus;/batch/&batch;</outDir>
+      <outDir>corpora/&corpus;/batches/&batch;</outDir>
       <files>$</files>
       <fileName>"bacteria.txt"</fileName>
       <lines>documents.sections.layer:bacteria</lines>
@@ -762,7 +762,7 @@
     </bacteria>
 
     <habitats class="TabularExport">
-      <outDir>corpora/&corpus;/batch/&batch;</outDir>
+      <outDir>corpora/&corpus;/batches/&batch;</outDir>
       <files>$</files>
       <fileName>"habitats.txt"</fileName>
       <lines>documents.sections.layer:habitats</lines>
@@ -783,7 +783,7 @@
     </habitats>
 
     <phenotypes class="TabularExport">
-      <outDir>corpora/&corpus;/batch/&batch;</outDir>
+      <outDir>corpora/&corpus;/batches/&batch;</outDir>
       <files>$</files>
       <fileName>"phenotypes.txt"</fileName>
       <lines>documents.sections.layer:phenotypes</lines>
@@ -804,7 +804,7 @@
     </phenotypes>
 
     <uses class="TabularExport">
-      <outDir>corpora/&corpus;/batch/&batch;</outDir>
+      <outDir>corpora/&corpus;/batches/&batch;</outDir>
       <files>$</files>
       <fileName>"uses.txt"</fileName>
       <lines>documents.sections.layer:uses</lines>
@@ -825,7 +825,7 @@
     </uses>
 
     <geo class="TabularExport">
-      <outDir>corpora/&corpus;/batch/&batch;</outDir>
+      <outDir>corpora/&corpus;/batches/&batch;</outDir>
       <files>$</files>
       <fileName>"geo.txt"</fileName>
       <lines>documents.sections.layer:Geographical</lines>
@@ -840,7 +840,7 @@
     </geo>
 
     <relations class="TabularExport">
-      <outDir>corpora/&corpus;/batch/&batch;</outDir>
+      <outDir>corpora/&corpus;/batches/&batch;</outDir>
       <files>$</files>
       <fileName>"relations.txt"</fileName>
       <lines>documents.sections.relations:CooccurrenceLocalization.tuples</lines>
@@ -863,7 +863,7 @@
     </relations>
 
     <relations-pheno class="TabularExport">
-      <outDir>corpora/&corpus;/batch/&batch;</outDir>
+      <outDir>corpora/&corpus;/batches/&batch;</outDir>
       <files>$</files>
       <fileName>"phenotype-relations.txt"</fileName>
       <lines>documents.sections.relations:PhenotypeRelation.tuples</lines>
@@ -886,7 +886,7 @@
     </relations-pheno>
 
     <relations-use class="TabularExport">
-      <outDir>corpora/&corpus;/batch/&batch;</outDir>
+      <outDir>corpora/&corpus;/batches/&batch;</outDir>
       <files>$</files>
       <fileName>"uses-relations.txt"</fileName>
       <lines>documents.sections.relations:UseRelation.tuples</lines>
@@ -909,7 +909,7 @@
     </relations-use>
 
     <!-- <rdf class="RDFExport"> -->
-    <!--   <outDir>corpora/&corpus;/batch/&batch;</outDir> -->
+    <!--   <outDir>corpora/&corpus;/batches/&batch;</outDir> -->
     <!--   <files>$</files> -->
     <!--   <fileName>"&batch;.ttl"</fileName> -->
     <!--   <format>turtle</format> -->
@@ -941,7 +941,7 @@
 
     <!--
 	<module id="document-richness" class="TabularExport">
-	<outDir>corpora/&corpus;/batch/&batch;</outDir>
+	<outDir>corpora/&corpus;/batches/&batch;</outDir>
 	<files>$</files>
 	<fileName>"document-richness.txt"</fileName>
 	<lines>documents[sections[layer:microorganism and layer:habitats]]</lines>
@@ -977,7 +977,7 @@
       </index-sentences>
 
       <sentences class="TabularExport">
-	<outDir>corpora/&corpus;/batch/&batch;</outDir>
+	<outDir>corpora/&corpus;/batches/&batch;</outDir>
 	<files>$</files>
 	<fileName>"sentences.txt"</fileName>
 	<lines>documents.sections.layer:sentences[@name != "author"]</lines>
@@ -998,7 +998,7 @@
       </sentences>
 
       <anaphora class="TabularExport">
-	<outDir>corpora/&corpus;/batch/&batch;</outDir>
+	<outDir>corpora/&corpus;/batches/&batch;</outDir>
 	<files>$</files>
 	<fileName>"anaphora.txt"</fileName>
 	<lines>documents.sections.relations:coreferences.tuples[args:Ante]</lines>
@@ -1031,7 +1031,7 @@
       </anaphora>
 
       <dependencies class="TabularExport">
-	<outDir>corpora/&corpus;/batch/&batch;</outDir>
+	<outDir>corpora/&corpus;/batches/&batch;</outDir>
 	<files>$</files>
 	<fileName>"dependencies.txt"</fileName>
 	<lines>documents.sections[@name != "author"].relations:dependencies.tuples</lines>
@@ -1163,7 +1163,7 @@
 
   <!--   <module id="annotation" class="ExportCadixeJSON"> -->
   <!--     <sectionFilter>@name == "title" or @name == "abstract"</sectionFilter> -->
-  <!--     <outDir>corpora/&corpus;/batch/&batch;/json/annotation</outDir> -->
+  <!--     <outDir>corpora/&corpus;/batches/&batch;/json/annotation</outDir> -->
   <!--     <documentDescription>"[" ^ @id ^ "] " ^ sections:title.contents</documentDescription> -->
   <!--     <documentProperties>DocumentID=@id</documentProperties> -->
   <!--     <annotationSets> -->
@@ -1250,7 +1250,7 @@
     </habitat-ancestors>
     
     <index class="AlvisDBIndexer">
-      <indexDir>corpora/&corpus;/batch/&batch;/adb</indexDir>
+      <indexDir>corpora/&corpus;/batches/&batch;/adb</indexDir>
       <elements>
 	<relations>
 	  <items>documents.sections.relations:CooccurrenceLocalization.tuples[args:Bacterium[@bacteria == "true"]]</items>
@@ -1270,7 +1270,7 @@
   </adb>
 
   <index class="AlvisIRIndexer">
-    <indexDir>corpora/&corpus;/batch/&batch;/index</indexDir>
+    <indexDir>corpora/&corpus;/batches/&batch;/index</indexDir>
     <tokenPositionGap>9216</tokenPositionGap>
     <fieldNames>title,abstract,author,full-author,pmid,year,journal,mesh,url</fieldNames>
     <relations>
@@ -1408,7 +1408,7 @@
   </index>
 
   <index-food class="AlvisIRIndexer">
-    <indexDir>corpora/&corpus;/batch/&batch;/index-food</indexDir>
+    <indexDir>corpora/&corpus;/batches/&batch;/index-food</indexDir>
     <tokenPositionGap>9216</tokenPositionGap>
     <fieldNames>title,abstract,author,full-author,pmid,year,journal,mesh,url</fieldNames>
     <relations>
@@ -1566,14 +1566,14 @@
   </add-feature3>
   <html class="QuickHTML">
     <active>false</active>
-    <outDir>corpora/&corpus;/batch/&batch;/html</outDir>
+    <outDir>corpora/&corpus;/batches/&batch;/html</outDir>
     <classFeature>ne-type</classFeature>
     <layers>phenotypes,microorganism,habitats</layers>
     <colors>#99cc00,#ffcc99,#ffd333,#ffd666</colors>
   </html>
 
   <words class="TabularExport">
-    <outDir>corpora/&corpus;/batch/&batch;</outDir>
+    <outDir>corpora/&corpus;/batches/&batch;</outDir>
     <files>$</files>
     <fileName>"words.txt"</fileName>
     <lines>documents.sections[@name == "title" or @name == "abstract"].layer:words</lines>
@@ -1585,7 +1585,7 @@
 
   <bionlp-st-a2>
     <habitats class="TabularExport">
-      <outDir>corpora/&corpus;/batch/&batch;/a2</outDir>
+      <outDir>corpora/&corpus;/batches/&batch;/a2</outDir>
       <files>documents.sections</files>
       <fileName>document.@id ^ ".a2"</fileName>
       <lines>layer:habitats</lines>
@@ -1598,7 +1598,7 @@
     
     <phenotypes class="TabularExport">
       <append/>
-      <outDir>corpora/&corpus;/batch/&batch;/a2</outDir>
+      <outDir>corpora/&corpus;/batches/&batch;/a2</outDir>
       <files>documents.sections</files>
       <fileName>document.@id ^ ".a2"</fileName>
       <lines>layer:phenotypes</lines>
@@ -1611,7 +1611,7 @@
     
     <microorganisms class="TabularExport">
       <append/>
-      <outDir>corpora/&corpus;/batch/&batch;/a2</outDir>
+      <outDir>corpora/&corpus;/batches/&batch;/a2</outDir>
       <files>documents.sections</files>
       <fileName>document.@id ^ ".a2"</fileName>
       <lines>layer:microorganism</lines>
@@ -1624,7 +1624,7 @@
 
     <obt class="TabularExport">
       <append/>
-      <outDir>corpora/&corpus;/batch/&batch;/a2</outDir>
+      <outDir>corpora/&corpus;/batches/&batch;/a2</outDir>
       <files>documents.sections</files>
       <fileName>document.@id ^ ".a2"</fileName>
       <lines>layer:habitats|layer:phenotypes</lines>
@@ -1636,7 +1636,7 @@
 
     <taxid class="TabularExport">
       <append/>
-      <outDir>corpora/&corpus;/batch/&batch;/a2</outDir>
+      <outDir>corpora/&corpus;/batches/&batch;/a2</outDir>
       <files>documents.sections</files>
       <fileName>document.@id ^ ".a2"</fileName>
       <lines>layer:microorganism</lines>
@@ -1648,7 +1648,7 @@
 
     <lives-in class="TabularExport">
       <append/>
-      <outDir>corpora/&corpus;/batch/&batch;/a2</outDir>
+      <outDir>corpora/&corpus;/batches/&batch;/a2</outDir>
       <files>documents.sections</files>
       <fileName>document.@id ^ ".a2"</fileName>
       <lines>relations:CooccurrenceLocalization.tuples[args:Localization.@concept-id != ""]</lines>
@@ -1660,7 +1660,7 @@
 
     <exhibits class="TabularExport">
       <append/>
-      <outDir>corpora/&corpus;/batch/&batch;/a2</outDir>
+      <outDir>corpora/&corpus;/batches/&batch;/a2</outDir>
       <files>documents.sections</files>
       <fileName>document.@id ^ ".a2"</fileName>
       <lines>relations:PhenotypeRelation.tuples</lines>
@@ -1673,7 +1673,7 @@
 
 
   <success class="TabularExport">
-    <outDir>corpora/&corpus;/batch/&batch;</outDir>
+    <outDir>corpora/&corpus;/batches/&batch;</outDir>
     <files>$</files>
     <fileName>"success.txt"</fileName>
     <lines>documents</lines>
diff --git a/plans/tomap-habitats.plan b/plans/tomap-habitats.plan
index d6abf84a..e5dcec1c 100644
--- a/plans/tomap-habitats.plan
+++ b/plans/tomap-habitats.plan
@@ -3,7 +3,7 @@
 
   <!-- ToMap on lemmas -->
   <tomap class="TomapProjector">
-    <yateaFile output-feed="true">corpora/&corpus;/batch/&batch;/yatea/candidates.xml</yateaFile>
+    <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile>
     <targetLayerName>habitats</targetLayerName>
     <conceptFeature>concept-id</conceptFeature>
     <explanationFeaturePrefix>explain_</explanationFeaturePrefix>
@@ -26,7 +26,7 @@
   <tomap-on-alternative-lemmas>
 
     <tomap class="TomapProjector">
-      <yateaFile output-feed="true">corpora/&corpus;/batch/&batch;/yatea/candidates.xml</yateaFile>
+      <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile>
       <targetLayerName>habitats2</targetLayerName>
       <conceptFeature>concept-id</conceptFeature>
       <explanationFeaturePrefix>explain_</explanationFeaturePrefix>
@@ -58,7 +58,7 @@
   <tomap-no-lemmakeys>
 
     <tomap class="TomapProjector">
-      <yateaFile output-feed="true">corpora/&corpus;/batch/&batch;/yatea/candidates.xml</yateaFile>
+      <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile>
       <targetLayerName>habitats3</targetLayerName>
       <conceptFeature>concept-id</conceptFeature>
       <explanationFeaturePrefix>explain_</explanationFeaturePrefix>
@@ -89,7 +89,7 @@
   <tomap-on-variants>
 
     <tomap class="TomapProjector">
-      <yateaFile output-feed="true">corpora/&corpus;/batch/&batch;/yatea-var/candidates.xml</yateaFile>
+      <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea-var/candidates.xml</yateaFile>
       <targetLayerName>habitats4</targetLayerName>
       <conceptFeature>concept-id</conceptFeature>
       <explanationFeaturePrefix>explain_</explanationFeaturePrefix>
@@ -121,7 +121,7 @@
   <tomap-no-lemmakeys-word-form>
 
     <tomap class="TomapProjector">
-      <yateaFile output-feed="true">corpora/&corpus;/batch/&batch;/yatea/candidates.xml</yateaFile>
+      <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile>
       <targetLayerName>habitats5</targetLayerName>
       <conceptFeature>concept-id</conceptFeature>
       <explanationFeaturePrefix>explain_</explanationFeaturePrefix>
@@ -196,7 +196,7 @@
   <bioyatea-projection class="YateaTermsProjector">
     <targetLayerName>yateaTerms</targetLayerName>
     <!--<yateaFile inhibitCheck="true">words_prepro/default/xml/candidates_pp.xml</yateaFile>-->
-    <yateaFile output-feed="yes">corpora/&corpus;/batch/&batch;/yatea/candidates.xml</yateaFile> <!-- ??? -->
+    <yateaFile output-feed="yes">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> <!-- ??? -->
     <subject layer="words"/>
     <termLemma>lemma</termLemma>
   </bioyatea-projection>
diff --git a/plans/tomap-microbial-phenotypes.plan b/plans/tomap-microbial-phenotypes.plan
index d6b4a22f..17481643 100644
--- a/plans/tomap-microbial-phenotypes.plan
+++ b/plans/tomap-microbial-phenotypes.plan
@@ -2,7 +2,7 @@
   
   <!-- ToMap on lemmas -->
   <tomap class="TomapProjector">
-    <yateaFile output-feed="true">corpora/&corpus;/batch/&batch;/yatea/candidates.xml</yateaFile>
+    <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile>
     <targetLayerName>phenotypes</targetLayerName>
     <conceptFeature>concept-id</conceptFeature>
     <explanationFeaturePrefix>explain_</explanationFeaturePrefix>
@@ -25,7 +25,7 @@
   <tomap-on-alternative-lemmas>
 
     <tomap class="TomapProjector">
-      <yateaFile output-feed="true">corpora/&corpus;/batch/&batch;/yatea/candidates.xml</yateaFile>
+      <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile>
       <targetLayerName>phenotypes2</targetLayerName>
       <conceptFeature>concept-id</conceptFeature>
       <explanationFeaturePrefix>explain_</explanationFeaturePrefix>
@@ -57,7 +57,7 @@
   <tomap-no-lemmakeys>
 
     <tomap class="TomapProjector">
-      <yateaFile output-feed="true">corpora/&corpus;/batch/&batch;/yatea/candidates.xml</yateaFile>
+      <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile>
       <targetLayerName>phenotypes3</targetLayerName>
       <conceptFeature>concept-id</conceptFeature>
       <explanationFeaturePrefix>explain_</explanationFeaturePrefix>
@@ -89,7 +89,7 @@
   <tomap-on-variants>
 
     <tomap class="TomapProjector">
-      <yateaFile output-feed="true">corpora/&corpus;/batch/&batch;/yatea-var/candidates.xml</yateaFile>
+      <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea-var/candidates.xml</yateaFile>
       <targetLayerName>phenotypes4</targetLayerName>
       <conceptFeature>concept-id</conceptFeature>
       <explanationFeaturePrefix>explain_</explanationFeaturePrefix>
@@ -121,7 +121,7 @@
   <tomap-no-lemmakeys-word-form>
 
     <tomap class="TomapProjector">
-      <yateaFile output-feed="true">corpora/&corpus;/batch/&batch;/yatea/candidates.xml</yateaFile>
+      <yateaFile output-feed="true">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile>
       <targetLayerName>phenotypes5</targetLayerName>
       <conceptFeature>concept-id</conceptFeature>
       <explanationFeaturePrefix>explain_</explanationFeaturePrefix>
@@ -166,7 +166,7 @@
   <bioyatea-projection class="YateaTermsProjector">
     <targetLayerName>yateaTerms</targetLayerName>
     <!--<yateaFile inhibitCheck="true">words_prepro/default/xml/candidates_pp.xml</yateaFile>-->
-    <yateaFile output-feed="yes">corpora/&corpus;/batch/&batch;/yatea/candidates.xml</yateaFile> <!-- ??? -->
+    <yateaFile output-feed="yes">corpora/&corpus;/batches/&batch;/yatea/candidates.xml</yateaFile> <!-- ??? -->
     <subject layer="words"/>
     <termLemma>lemma</termLemma>
   </bioyatea-projection>
diff --git a/process-evaluate_BioNLP-OST.snakefile b/process-evaluate_BioNLP-OST.snakefile
index e9f494ff..d3dca3e6 100644
--- a/process-evaluate_BioNLP-OST.snakefile
+++ b/process-evaluate_BioNLP-OST.snakefile
@@ -13,8 +13,8 @@ all
 '''
 rule all:
 	input:
-		log=expand("corpora/BioNLP-OST-2019/batch/{B}/eval.json", B=BATCHES),
-                scores=expand("corpora/BioNLP-OST-2019/batch/{B}/eval.json", B=BATCHES)
+		log=expand("corpora/BioNLP-OST-2019/batches/{B}/eval.json", B=BATCHES),
+                scores=expand("corpora/BioNLP-OST-2019/batches/{B}/eval.json", B=BATCHES)
 
 
 '''
@@ -23,19 +23,19 @@ batches using the alvisnlp plan (entities.plan)
 '''
 rule run_bionlp_prediction:
 	input:
-		dir=directory("corpora/BioNLP-OST-2019/batch/{B}/bionlp-st"),
-		xslt="corpora/microbes-2019/microbes-2019-pubmed2alvisnlp.xslt"
+		dir=directory("corpora/BioNLP-OST-2019/batches/{B}/bionlp-st"),
+		xslt="corpora/pubmed/microbes-2019-pubmed2alvisnlp.xslt"
 	output:
-		relations="corpora/BioNLP-OST-2019/batch/{B}/relations.txt",
-		phenotypeRelations="corpora/BioNLP-OST-2019/batch/{B}/phenotype-relations.txt",
-		usesRelations="corpora/BioNLP-OST-2019/batch/{B}/uses-relations.txt",
-		microorganisms="corpora/BioNLP-OST-2019/batch/{B}/microorganisms.txt",
-		habitats="corpora/BioNLP-OST-2019/batch/{B}/habitats.txt",
-		phenotypes="corpora/BioNLP-OST-2019/batch/{B}/phenotypes.txt",
-		uses="corpora/BioNLP-OST-2019/batch/{B}/uses.txt",
-		index=directory("corpora/BioNLP-OST-2019/batch/{B}/index"),
-		a2=directory("corpora/BioNLP-OST-2019/batch/{B}/a2")
-	log:"corpora/BioNLP-OST-2019/batch/{B}/alvisnlp.log"
+		relations="corpora/BioNLP-OST-2019/batches/{B}/relations.txt",
+		phenotypeRelations="corpora/BioNLP-OST-2019/batches/{B}/phenotype-relations.txt",
+		usesRelations="corpora/BioNLP-OST-2019/batches/{B}/uses-relations.txt",
+		microorganisms="corpora/BioNLP-OST-2019/batches/{B}/microorganisms.txt",
+		habitats="corpora/BioNLP-OST-2019/batches/{B}/habitats.txt",
+		phenotypes="corpora/BioNLP-OST-2019/batches/{B}/phenotypes.txt",
+		uses="corpora/BioNLP-OST-2019/batches/{B}/uses.txt",
+		index=directory("corpora/BioNLP-OST-2019/batches/{B}/index"),
+		a2=directory("corpora/BioNLP-OST-2019/batches/{B}/a2")
+	log:"corpora/BioNLP-OST-2019/batches/{B}/alvisnlp.log"
 	params:
 		batch="{B}",
 		corpus='BioNLP-OST-2019',
@@ -43,7 +43,7 @@ rule run_bionlp_prediction:
 		onto='ancillaries/BioNLP-OST+EnovFood',
                 ontobiotopeUse='ancillaries/Use_V2',
 		plan='plans/entities.plan',
-		dir='corpora/BioNLP-OST-2019/batch/{B}/',
+		dir='corpora/BioNLP-OST-2019/batches/{B}/',
 		taxid_microorganisms='ancillaries/ncbi-taxonomy-prefix/taxid_microorganisms.txt',
                 taxa_id_full='ancillaries/ncbi-taxonomy-prefix/taxa+id_full.txt'
 	singularity:config["SINGULARITY_IMG"]
@@ -67,18 +67,18 @@ rule run_bionlp_prediction:
 
 rule archive_prediction:
     input:
-        a2=directory("corpora/BioNLP-OST-2019/batch/{B}/a2")
+        a2="corpora/BioNLP-OST-2019/batches/{B}/a2"  # plain path: directory() is an output-only flag
     output:
-        zip="corpora/BioNLP-OST-2019/batch/{B}/predictions.zip"
+        zip="corpora/BioNLP-OST-2019/batches/{B}/predictions.zip"
     shell:
         """zip -9 {output.zip} {input.a2}/*.a2"""
 
         
 rule evaluate:
     input:
-        zip="corpora/BioNLP-OST-2019/batch/{B}/predictions.zip"
+        zip="corpora/BioNLP-OST-2019/batches/{B}/predictions.zip"
     output:
-        scores="corpora/BioNLP-OST-2019/batch/{B}/eval.json"
+        scores="corpora/BioNLP-OST-2019/batches/{B}/eval.json"
     params:
         api=config["BIONLPOST_API"],
         task="{B}"
diff --git a/process_PubMed_corpus.snakefile b/process_PubMed_corpus.snakefile
index b5a20ac3..16c8bef8 100644
--- a/process_PubMed_corpus.snakefile
+++ b/process_PubMed_corpus.snakefile
@@ -1,5 +1,6 @@
 ## local rule
 # localrules: all, concat_results
+localrules:create_dummy_bionlp_st_dir
 
 ## config file
 configfile: "config/config.yaml"
@@ -16,10 +17,10 @@ all
 '''
 rule all:
 	input:
-		results=expand("corpora/microbes-2019/{R}.full.txt", R=RESULTS),
-		log=expand("corpora/microbes-2019/batch/{B}/alvisnlp.log", B=BATCHES),
-		index_folder="corpora/microbes-2019/index",
-		expander_folder="corpora/microbes-2019/expander",
+		results=expand("corpora/pubmed/{R}.full.txt", R=RESULTS),
+		log=expand("corpora/pubmed/batches/{B}/alvisnlp.log", B=BATCHES),
+		index_folder="corpora/pubmed/index",
+		expander_folder="corpora/pubmed/expander",
 		florilege_Habitat_result="corpora/florilege/pubmed/PubMed-Habitat.txt",
 		florilege_Phenotype_result="corpora/florilege/pubmed/PubMed-Phenotype.txt",
 		florilege_Use_result="corpora/florilege/pubmed/PubMed-Use.txt"
@@ -27,7 +28,7 @@ rule all:
 
 rule create_dummy_bionlp_st_dir:
     output:
-        dummy=directory("corpora/microbes-2019/batch/{B}/bionlp-st")
+        dummy=directory("corpora/pubmed/batches/{B}/bionlp-st")
     shell:
         '''mkdir -p {output.dummy}'''
 
@@ -38,27 +39,27 @@ batches using the alvisnlp plan (entities.plan)
 '''
 rule run_pubmed_entities:
 	input:
-		file="corpora/microbes-2019/batch/{B}/batch.xml",
-		dummy=("corpora/microbes-2019/batch/{B}/bionlp-st"),
-		xslt="corpora/microbes-2019/microbes-2019-pubmed2alvisnlp.xslt"
+		file="corpora/pubmed/batches/{B}/batch.xml",
+		dummy=("corpora/pubmed/batches/{B}/bionlp-st"),
+		xslt="corpora/pubmed/microbes-2019-pubmed2alvisnlp.xslt"
 	output:
-		relations="corpora/microbes-2019/batch/{B}/relations.txt",
-		phenotypeRelations="corpora/microbes-2019/batch/{B}/phenotype-relations.txt",
-		usesRelations="corpora/microbes-2019/batch/{B}/uses-relations.txt",
-		microorganisms="corpora/microbes-2019/batch/{B}/microorganisms.txt",
-		habitats="corpora/microbes-2019/batch/{B}/habitats.txt",
-		phenotypes="corpora/microbes-2019/batch/{B}/phenotypes.txt",
-		uses="corpora/microbes-2019/batch/{B}/uses.txt",
-		index=directory("corpora/microbes-2019/batch/{B}/index")
-	log:"corpora/microbes-2019/batch/{B}/alvisnlp.log"
+		relations="corpora/pubmed/batches/{B}/relations.txt",
+		phenotypeRelations="corpora/pubmed/batches/{B}/phenotype-relations.txt",
+		usesRelations="corpora/pubmed/batches/{B}/uses-relations.txt",
+		microorganisms="corpora/pubmed/batches/{B}/microorganisms.txt",
+		habitats="corpora/pubmed/batches/{B}/habitats.txt",
+		phenotypes="corpora/pubmed/batches/{B}/phenotypes.txt",
+		uses="corpora/pubmed/batches/{B}/uses.txt",
+		index=directory("corpora/pubmed/batches/{B}/index")
+	log:"corpora/pubmed/batches/{B}/alvisnlp.log"
 	params:
 		batch="{B}",
-		corpus='microbes-2019',
+		corpus='pubmed',
                 inhibitSyntax='inhibit-syntax',
 		onto='ancillaries/BioNLP-OST+EnovFood',
                 ontobiotopeUse='ancillaries/Use_V2',
 		plan='plans/entities.plan',
-		dir='corpora/microbes-2019/batch/{B}/',
+		dir='corpora/pubmed/batches/{B}/',
 		taxid_microorganisms='ancillaries/ncbi-taxonomy-prefix/taxid_microorganisms.txt',
                 taxa_id_full='ancillaries/ncbi-taxonomy-prefix/taxa+id_full.txt'
 	singularity:config["SINGULARITY_IMG"]
@@ -96,9 +97,9 @@ for
 '''
 rule concat_results:
 	input: 
-		expand("corpora/microbes-2019/batch/{B}/{{R}}.txt", B=BATCHES)
+		expand("corpora/pubmed/batches/{B}/{{R}}.txt", B=BATCHES)
 	output:
-		result="corpora/microbes-2019/{R}.full.txt"
+		result="corpora/pubmed/{R}.full.txt"
 	run:
 		with open(output.result, 'w') as out:
 			for fname in input:
@@ -112,9 +113,9 @@ merge indexes from the batches
 '''
 rule merge_pubmed_index:
 	input:
-		index=expand("corpora/microbes-2019/batch/{B}/index", B=BATCHES)
+		index=expand("corpora/pubmed/batches/{B}/index", B=BATCHES)
 	output:
-		index_folder=directory("corpora/microbes-2019/index")
+		index_folder=directory("corpora/pubmed/index")
 	params:
 		alvisir=config["ALVISIR_HOME"]
 	shell: """
@@ -136,7 +137,7 @@ rule create_pubmed_expander:
 		onto_phenotype="ancillaries/BioNLP-OST+EnovFood-Phenotype.obo",
 		onto_use="ancillaries/Use_V2.obo"
 	output:
-		expander_folder=directory("corpora/microbes-2019/expander")
+		expander_folder=directory("corpora/pubmed/expander")
 	params:
 		alvisir=config["ALVISIR_HOME"]
 	shell:"""
@@ -150,7 +151,7 @@ integration in Florilege
 '''
 rule format_pubmed_relations:
 	input:
-		file="corpora/microbes-2019/relations.full.txt"
+		file="corpora/pubmed/relations.full.txt"
 	output:
 		florilege_result="corpora/florilege/pubmed/PubMed-Habitat.txt"
 	conda: 'softwares/envs/python3_env.yaml'
@@ -167,7 +168,7 @@ integration in Florilege
 '''
 rule format_pubmed_phenotype_relations:
 	input:
-		file="corpora/microbes-2019/phenotype-relations.full.txt"
+		file="corpora/pubmed/phenotype-relations.full.txt"
 	output:
 		florilege_result="corpora/florilege/pubmed/PubMed-Phenotype.txt"
 	conda: 'softwares/envs/python3_env.yaml'
@@ -183,7 +184,7 @@ integration in Florilege
 '''
 rule format_pubmed_use_relations:
 	input:
-		file="corpora/microbes-2019/uses-relations.full.txt"
+		file="corpora/pubmed/uses-relations.full.txt"
 	output:
 		florilege_result="corpora/florilege/pubmed/PubMed-Use.txt"
 	conda: 'softwares/envs/python3_env.yaml'
-- 
GitLab