Skip to content
Snippets Groups Projects
Snakefile 28.8 KiB
Newer Older
Alexis Mergez's avatar
Alexis Mergez committed
configfile: "config.yaml"

include: "rules/tools.smk"
Alexis Mergez's avatar
Alexis Mergez committed
## Modules
Alexis Mergez's avatar
Alexis Mergez committed
import os
Alexis Mergez's avatar
Alexis Mergez committed
import numpy as np
import gzip
Alexis Mergez's avatar
Alexis Mergez committed

Alexis Mergez's avatar
Alexis Mergez committed
"""
Main variables used in the workflow ---------------------------------------------------------------
Alexis Mergez's avatar
Alexis Mergez committed
"""

# Retrieving the list of haplotypes (SAMPLES) in data/haplotypes.
# File with .fa extension only will be used. (.fna will not be used)
SAMPLES = np.unique([
    os.path.basename(file).split('.fa')[0] 
    for file in os.listdir("data/haplotypes/")
    if re.search(r"\.fa", file)
Alexis Mergez's avatar
Alexis Mergez committed

# Retrieving the list of haplotypes excluding the reference
Alexis Mergez's avatar
Alexis Mergez committed
SAMPLES_NOREF = [
    sample 
    for sample in SAMPLES 
    if sample != os.path.basename(config['reference']).split('.fa')[0]
    ]
Alexis Mergez's avatar
Alexis Mergez committed

# Storing the number of haplotypes
Alexis Mergez's avatar
Alexis Mergez committed
nHAP = len(SAMPLES)

Alexis Mergez's avatar
Alexis Mergez committed
# Retrieving the list of chromosomes from the fasta index of the reference
with gzip.open("data/haplotypes/"+config['reference'], "r") as handle:
    CHRLIST = [line.decode().split("#")[-1].split('\n')[0] for line in handle.readlines() if line.decode()[0] == ">"]

# Adding optionnal output based on config.yaml, using the following function
Alexis Mergez's avatar
Alexis Mergez committed
def which_analysis():
    
    ## Default analysis
Alexis Mergez's avatar
Alexis Mergez committed
    analysis_inputs = [     
Alexis Mergez's avatar
Alexis Mergez committed
        "output/stats/pan1c."+config['name']+".core.stats.tsv", # core stats
        expand("output/panacus.reports/"+config['name']+".{chromosome}.histgrowth.html", chromosome=CHRLIST), # panacus histgrowth 
        expand("output/chrGraphs.figs/"+config['name']+".{chromosome}.1Dviz.png", chromosome=CHRLIST), # visualizations from odgi on chromosome graphs
        "output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv" # chromosomes graph statistics
Alexis Mergez's avatar
Alexis Mergez committed
    ]
    ## Optionals analysis steps
Alexis Mergez's avatar
Alexis Mergez committed
    if config["get_PAV"] == "True": # Adding PAV matrix creation
Alexis Mergez's avatar
Alexis Mergez committed
        analysis_inputs.append("output/pav.matrices")
Alexis Mergez's avatar
Alexis Mergez committed
    if config["get_ASMs_SyRI"] == "True": # Creating SyRI for each input assembly 
Alexis Mergez's avatar
Alexis Mergez committed
        analysis_inputs.append(
Alexis Mergez's avatar
Alexis Mergez committed
            expand("output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", haplotype=SAMPLES_NOREF)
        )
    if config["get_chrInputs_SyRI"] == "True": # Creating SyRI figures for each PGGB input
Alexis Mergez's avatar
Alexis Mergez committed
        analysis_inputs.append(
Alexis Mergez's avatar
Alexis Mergez committed
            expand("output/chrInput.syri.figs/"+config['name']+".{chromosome}.asm.syri.png", chromosome=CHRLIST)
Alexis Mergez's avatar
Alexis Mergez committed
        )
    if config["run_Quast"] == "True": # Running Quast on input haplotypes
Alexis Mergez's avatar
Alexis Mergez committed
        analysis_inputs.append(
Alexis Mergez's avatar
Alexis Mergez committed
            "output/quast/"+config['name']+".quast.report.html"
Alexis Mergez's avatar
Alexis Mergez committed
        )
    if config["get_contig_pos"] == "True": # Chromosome decomposition into its contig figure
Alexis Mergez's avatar
Alexis Mergez committed
        analysis_inputs.append(
Alexis Mergez's avatar
Alexis Mergez committed
            expand("output/chr.contig/{haplotype}.contig.png", haplotype=CHRLIST) 
Alexis Mergez's avatar
Alexis Mergez committed
        )
        if config["create_report"] == "True": # Creating report (need contig)
            analysis_inputs.append(
                "output/pan1c."+config['name']+".report.md"
            )
Alexis Mergez's avatar
Alexis Mergez committed

    return analysis_inputs
Alexis Mergez's avatar
Alexis Mergez committed

"""
Rules   -------------------------------------------------------------------------------------------
Alexis Mergez's avatar
Alexis Mergez committed
"""
Alexis Mergez's avatar
Alexis Mergez committed
# Main target rule
Alexis Mergez's avatar
Alexis Mergez committed
rule all:
    input:
Alexis Mergez's avatar
Alexis Mergez committed
        "output/pan1c."+config['name']+".gfa", # Final graph (main output)
        "output/pan1c."+config['name']+".gfa.metadata", # Metadata for the final (also in top of gfa files as # line)
Alexis Mergez's avatar
Alexis Mergez committed
        which_analysis()
Alexis Mergez's avatar
Alexis Mergez committed

Alexis Mergez's avatar
Alexis Mergez committed
"""
Pre-processing section : preparing pggb inputs  ---------------------------------------------------
"""
Alexis Mergez's avatar
Alexis Mergez committed
rule ragtag_scaffolding:
    # Scaffold input haplotype against the reference to infer chromosome scale sequences
    input:
        ref="data/haplotypes/"+config['reference'],
        reffai="data/haplotypes/"+config['reference']+".fai",
        refgzi="data/haplotypes/"+config['reference']+".gzi",
        fa="data/haplotypes/{haplotype}.fa.gz"
    output:
        fa=temp("data/hap.ragtagged/{haplotype}.ragtagged.fa"),
Alexis Mergez's avatar
Alexis Mergez committed
        tar="data/hap.ragtagged/{haplotype}.tar.gz"
    threads: 8
    retries: 1
Alexis Mergez's avatar
Alexis Mergez committed
    priority: 100
Alexis Mergez's avatar
Alexis Mergez committed
    params:
        app_path=config["app.path"]
Alexis Mergez's avatar
Alexis Mergez committed
    log: 
        cmd="logs/ragtag/{haplotype}.ragtag.cmd.log",
        time="logs/ragtag/{haplotype}.ragtag.time.log"
Alexis Mergez's avatar
Alexis Mergez committed
    shell:
        """
Alexis Mergez's avatar
Alexis Mergez committed
        /usr/bin/time -v -o {log.time} \
            bash scripts/ragtagChromInfer.sh \
Alexis Mergez's avatar
Alexis Mergez committed
            -d $(dirname {output.fa})/{wildcards.haplotype} \
            -a {params.app_path} \
            -t {threads} \
            -r {input.ref} \
            -q {input.fa} \
Alexis Mergez's avatar
Alexis Mergez committed
            -o {output.fa} 2>&1 | \
            tee {log.cmd}
Alexis Mergez's avatar
Alexis Mergez committed
        #if [[ -z $(zgrep '[^[:space:]]' {output.fa}) ]] ; then
        #    echo "Error : Empty final fasta"
        #    exit 1
        #fi
        """
Alexis Mergez's avatar
Alexis Mergez committed
rule quast_stats:
    # Run Quast on ragtagged genomes
    input:
        fas=expand("data/haplotypes/{haplotype}.fa.gz", haplotype=SAMPLES_NOREF),
        ref="data/haplotypes/"+config['reference']
    output:
        report="output/quast/"+config['name']+".quast.report.html"
    threads: 8
    params:
        app_path=config["app.path"],
        pan_name=config["name"]
Alexis Mergez's avatar
Alexis Mergez committed
    log: 
Alexis Mergez's avatar
Alexis Mergez committed
        cmd="logs/quast/quast.cmd.log",
        time="logs/quast/quast.time.log"
Alexis Mergez's avatar
Alexis Mergez committed
    shell:
        """
Alexis Mergez's avatar
Alexis Mergez committed
        /usr/bin/time -v -o {log.time} \
            apptainer run {params.app_path}/pan1c-env.sif quast.py \
Alexis Mergez's avatar
Alexis Mergez committed
            -t {threads} \
            -o "$(dirname {output.report})" \
            -r {input.ref} \
            --plots-format png \
            --no-read-stats \
            --no-icarus \
            -s \
            --large \
Alexis Mergez's avatar
Alexis Mergez committed
            {input.fas} 2>&1 | \
            tee {log.cmd}
Alexis Mergez's avatar
Alexis Mergez committed
        # Compressing temporary files
        apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
            -@ {threads} $(dirname {output.report})/contigs_reports/*.fa
        apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
            -@ {threads} $(dirname {output.report})/contigs_reports/minimap_output/*
Alexis Mergez's avatar
Alexis Mergez committed
        mv $(dirname {output.report})/report.html {output.report}
        """
Alexis Mergez's avatar
Alexis Mergez committed
rule contig_position:
    # Produce figures with contig positions
    input:
        fa="data/chrInputs/"+config["name"]+".{chromosome}.fa.gz",
        fai="data/chrInputs/"+config["name"]+".{chromosome}.fa.gz.fai"
    output:
        fig="output/chr.contig/{chromosome}.contig.png",
        outdir=directory("output/chr.contig/{chromosome}")
    threads: 1
    params:
        app_path=config["app.path"]
    shell:
        """
        mkdir {output.outdir}
        base="{output.outdir}"
        hapID=$(echo {wildcards.chromosome} | sed 's/.hap/#/')
Alexis Mergez's avatar
Alexis Mergez committed
        
Alexis Mergez's avatar
Alexis Mergez committed
        # Creating .bands file
        apptainer run {params.app_path}/pan1c-env.sif python scripts/fasta_not_NNN_gff3.py \
            -s 5 \
            -f {input.fa} \
            > $base/{wildcards.chromosome}.gff
Alexis Mergez's avatar
Alexis Mergez committed
        #cat $base/{wildcards.chromosome}.gff
Alexis Mergez's avatar
Alexis Mergez committed
        awk '{{a++; if ((a/2)==int(a/2)){{b="gpos25"}}else{{b="gpos75"}}; print $1"\\t"$4"\\t"$5"\\tcontig_"a"\\t"b}}' $base/{wildcards.chromosome}.gff | \
            sed '1ichr\\tstart\\tend\\tname\\tgieStain' \
            > $base/{wildcards.chromosome}.bands
Alexis Mergez's avatar
Alexis Mergez committed
        #cat $base/{wildcards.chromosome}.bands
Alexis Mergez's avatar
Alexis Mergez committed
        # Creating bed like file
        cat {input.fai} | \
            cut -f1,2 | tr '\\t' ',' | \
            sed "s/,/,1,/" | \
            sed "1i chr\\tstart\\tend" | \
            tr ',' '\\t' \
            > $base/{wildcards.chromosome}.tsv
Alexis Mergez's avatar
Alexis Mergez committed
        #cat $base/{wildcards.chromosome}.tsv
Alexis Mergez's avatar
Alexis Mergez committed
        # Creating figure
        apptainer run --app Renv {params.app_path}/pan1c-env.sif Rscript scripts/contig_position.R \
            -b $base/{wildcards.chromosome}.bands \
            -t $base/{wildcards.chromosome}.tsv \
            -o {output.fig}
        """
Alexis Mergez's avatar
Alexis Mergez committed
rule chromosome_clustering:
    # Read ragtagged fastas and split chromosome sequences into according FASTA files
Alexis Mergez's avatar
Alexis Mergez committed
    input:
        expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES)
    output:
Alexis Mergez's avatar
Alexis Mergez committed
        temp(expand('data/chrInputs/'+config['name']+'.{chromosome}.fa', chromosome=CHRLIST))
Alexis Mergez's avatar
Alexis Mergez committed
    threads: 1
Alexis Mergez's avatar
Alexis Mergez committed
    priority: 100
Alexis Mergez's avatar
Alexis Mergez committed
    params:
        app_path=config["app.path"],
        pan_name=config["name"]
    shell:
        """
        mkdir -p $(dirname {output[0]})
        apptainer run {params.app_path}/pan1c-env.sif python scripts/inputClustering.py \
            --fasta {input} --output $(dirname {output[0]}) --panname {params.pan_name}
        """
Alexis Mergez's avatar
Alexis Mergez committed
rule SyRI_on_ASM:
    # Run SyRI on a single assembly. 
    # The assembly is mapped on the 'reference' and SyRI search for SV.
Alexis Mergez's avatar
Alexis Mergez committed
    input:
        ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz",
        qry="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz"
    output:
        fig="output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png",
        wrkdir=directory('data/asm.syri/{haplotype}')
Alexis Mergez's avatar
Alexis Mergez committed
    log: 
        cmd="logs/SyRI_ASM/{haplotype}.SyRI_ASM.cmd.log",
        time="logs/SyRI_ASM/{haplotype}.SyRI_ASM.time.log"
Alexis Mergez's avatar
Alexis Mergez committed
    threads: 4
    retries: 1
    resources:
        mem_gb=48
    params:
        app_path=config["app.path"]
    shell:
        """
        mkdir -p {output.wrkdir}
Alexis Mergez's avatar
Alexis Mergez committed
        /usr/bin/time -v -o {log.time} \
            bash scripts/getSyriFigs.sh \
Alexis Mergez's avatar
Alexis Mergez committed
            -a {params.app_path} \
            -t {threads} \
            -d {output.wrkdir} \
            -o $(basename {output.fig}) \
            -r {input.ref} \
Alexis Mergez's avatar
Alexis Mergez committed
            -q "{input.qry}" 2>&1 | \
            tee {log.cmd}
        
Alexis Mergez's avatar
Alexis Mergez committed
        mv {output.wrkdir}/$(basename {output.fig}) {output.fig}
        """
Alexis Mergez's avatar
Alexis Mergez committed
"""
Core section : Running PGGB
"""

Alexis Mergez's avatar
Alexis Mergez committed
rule SyRI_on_chrInput:
    # WIP
Alexis Mergez's avatar
Alexis Mergez committed
    input:
        fasta='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz'
    output:
        fig="output/chrInput.syri.figs/"+config['name']+".{chromosome}.asm.syri.png",
Alexis Mergez's avatar
Alexis Mergez committed
        wrkdir=directory('data/chrInputs/syri/{chromosome}')
Alexis Mergez's avatar
Alexis Mergez committed
    threads: workflow.cores
Alexis Mergez's avatar
Alexis Mergez committed
    params:
        app_path=config["app.path"],
        ref=config['reference']
    shell:
        """
        mkdir {output.wrkdir}
Alexis Mergez's avatar
Alexis Mergez committed
        refname=$(basename {params.ref} .fa.gz | cut -d'.' -f1,2)
Alexis Mergez's avatar
Alexis Mergez committed

        ## Creating single fasta from multifasta
        zcat {input.fasta} | awk -F"#" \
Alexis Mergez's avatar
Alexis Mergez committed
            '/^>/ {{OUT="{output.wrkdir}/" substr($0,2) ".fa"}}; {{print >> OUT; close(OUT)}}'
Alexis Mergez's avatar
Alexis Mergez committed

Alexis Mergez's avatar
Alexis Mergez committed
        apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
            -@ {threads} {output.wrkdir}/*.fa
Alexis Mergez's avatar
Alexis Mergez committed
        #zgrep '>' {output.wrkdir}/*.fa.gz
Alexis Mergez's avatar
Alexis Mergez committed

Alexis Mergez's avatar
Alexis Mergez committed
        ## Getting the list of sequences
Alexis Mergez's avatar
Alexis Mergez committed
        AllAsmList=()
Alexis Mergez's avatar
Alexis Mergez committed
        for file in {output.wrkdir}/*.fa.gz; do
Alexis Mergez's avatar
Alexis Mergez committed
            asm="$(basename $file .fa.gz | cut -f1,2 -d"#" | sed 's/#/\\.hap/').fa.gz"
Alexis Mergez's avatar
Alexis Mergez committed
            mv $file "$(dirname $file)/$asm"
Alexis Mergez's avatar
Alexis Mergez committed
            AllAsmList+=("$(dirname $file)/$asm")
Alexis Mergez's avatar
Alexis Mergez committed
        done
Alexis Mergez's avatar
Alexis Mergez committed
        
        echo "The ASM Array : ${{AllAsmList[@]}}"
Alexis Mergez's avatar
Alexis Mergez committed

        bash scripts/getSyriFigs.sh \
            -a {params.app_path} \
            -t {threads} \
            -d {output.wrkdir} \
            -o $(basename {output.fig}) \
Alexis Mergez's avatar
Alexis Mergez committed
            -r {output.wrkdir}/"${{refname}}.fa.gz" \
Alexis Mergez's avatar
Alexis Mergez committed
            -q "${{AllAsmList[*]}}" \
            -h 9 -w 16
Alexis Mergez's avatar
Alexis Mergez committed
        mv {output.wrkdir}/$(basename {output.fig}) {output.fig}
        rm {output.wrkdir}/*.fa
        """

rule wfmash_on_chr:
    # Run wfmash on a specific chromosome input
    input:
        fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz',
        fai='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz.fai'
    output:
        mapping=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.mapping.paf"),
        aln=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.aln.paf")
    threads: 16
Alexis Mergez's avatar
Alexis Mergez committed
    priority: 100
Alexis Mergez's avatar
Alexis Mergez committed
    params:
        app_path=config['app.path'],
        segment_length=config['wfmash.segment_length'],
        mapping_id=config['wfmash.mapping_id'],
        wfmash_sec=config['wfmash.secondary']
    log: 
        cmd_map="logs/pggb/{chromosome}.wfmash_mapping.cmd.log",
        time_map="logs/pggb/{chromosome}.wfmash_mapping.time.log",
        cmd_aln="logs/pggb/{chromosome}.wfmash_aln.cmd.log",
        time_aln="logs/pggb/{chromosome}.wfmash_aln.time.log",
    shell:
        """
        ## Mapping
        /usr/bin/time -v -o {log.time_map} \
            apptainer exec {params.app_path}/pggb.sif wfmash \
            -s {params.segment_length} -l $(( {params.segment_length} * 5 )) -p {params.mapping_id} \
            -n $(cat {input.fai} | wc -l) {params.wfmash_sec} -t {threads} \
            --tmp-base $(dirname {output.aln}) \
            {input.fa} --approx-map \
            1> {output.mapping} \
            2> >(tee {log.cmd_map} >&2)

        ## Aligning
        /usr/bin/time -v -o {log.time_aln} \
            apptainer exec {params.app_path}/pggb.sif wfmash \
            -s {params.segment_length} -l $(( {params.segment_length} * 5 )) -p {params.mapping_id} \
            -n $(cat {input.fai} | wc -l) {params.wfmash_sec} -t {threads} \
            --tmp-base $(dirname {output.aln}) \
            {input.fa} -i {output.mapping} \
            --invert-filtering \
            1> {output.aln} \
            2> >(tee {log.cmd_aln} >&2)

        # Compressing
        apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
            -@ {threads} -k {output.mapping}

        apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
            -@ {threads} -k {output.aln}
        """

rule seqwish:
    # Run seqwish on alignement produced by wfmash
    input:
        fa='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz',
        aln=rules.wfmash_on_chr.output.aln
    output:
        temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfa")
    threads: 8
Alexis Mergez's avatar
Alexis Mergez committed
    priority: 100
Alexis Mergez's avatar
Alexis Mergez committed
    params:
        app_path=config['app.path'],
        seqwish=config['seqwish.params']
    log: 
        cmd="logs/pggb/{chromosome}.seqwish.cmd.log",
        time="logs/pggb/{chromosome}.seqwish.time.log"
    shell:
        """
        /usr/bin/time -v -o {log.time} \
            apptainer exec {params.app_path}/pggb.sif seqwish \
            -s {input.fa} -p {input.aln} -g {output} \
            {params.seqwish} -t {threads} \
            --temp-dir $(dirname {output}) -P 2>&1 | \
            tee {log.cmd}

        # Compressing
        apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
            -@ {threads} -k {output}
        """

rule gfaffix_on_chr:
    # Run gfaffix on seqwish graph
    input:
        rules.seqwish.output
    output:
        gfa=temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.gfa"),
        transform="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.transform.txt"
    threads: 1
Alexis Mergez's avatar
Alexis Mergez committed
    priority: 100
Alexis Mergez's avatar
Alexis Mergez committed
    params:
        app_path=config['app.path']
    log: 
        time="logs/pggb/{chromosome}.gfaffix.time.log"
    shell:
        """
        /usr/bin/time -v -o {log.time} \
            apptainer exec {params.app_path}/pggb.sif gfaffix \
            {input} -o {output.gfa} -t {output.transform} \
            > /dev/null

        # Compressing
        apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
            -@ {threads} -k {output.gfa}
        """

rule odgi_postprocessing:
    # Running pggb's postprocessing (mainly odgi) steps with gfaffix graph
    input:
        rules.gfaffix_on_chr.output.gfa
    output:
        gfa='data/chrGraphs/'+config['name']+'.{chromosome}.gfa'
    threads: 8
Alexis Mergez's avatar
Alexis Mergez committed
    priority: 100
Alexis Mergez's avatar
Alexis Mergez committed
    params:
        app_path=config['app.path']
    log: 
        cmd_build="logs/pggb/{chromosome}.odgi_build.cmd.log",
        time_build="logs/pggb/{chromosome}.odgi_build.time.log",
        cmd_unchop="logs/pggb/{chromosome}.odgi_unchop.cmd.log",
        time_unchop="logs/pggb/{chromosome}.odgi_unchop.time.log",
        cmd_sort="logs/pggb/{chromosome}.odgi_sort.cmd.log",
        time_sort="logs/pggb/{chromosome}.odgi_sort.time.log",
        cmd_view="logs/pggb/{chromosome}.odgi_view.cmd.log",
        time_view="logs/pggb/{chromosome}.odgi_view.time.log"
    shell:
        """
        OGfile="$(dirname {input})/$(basename {input} .gfa)"

        ## Converting to odgi format and optimizing namespace
        /usr/bin/time -v -o {log.time_build} \
            apptainer run --app odgi {params.app_path}/PanGeTools.sif \
            build -t {threads} -P -g {input} -o $OGfile.og -O 2>&1 | \
            tee {log.cmd_build}
Alexis Mergez's avatar
Alexis Mergez committed
        ## Unchoping the nodes (merges unitigs into single nodes)
        /usr/bin/time -v -o {log.time_unchop} \
            apptainer run --app odgi {params.app_path}/PanGeTools.sif \
            unchop -P -t {threads} -i $OGfile.og -o $OGfile.unchoped.og 2>&1 | \
            tee {log.cmd_unchop}

        ## Sorting the nodes
        /usr/bin/time -v -o {log.time_sort} \
            apptainer run --app odgi {params.app_path}/PanGeTools.sif \
            sort -P -p Ygs --temp-dir $(dirname $OGfile) -t {threads} \
            -i $OGfile.unchoped.og -o $OGfile.unchoped.sorted.og 2>&1 | \
            tee {log.cmd_sort}

        ## Exporting to gfa
        /usr/bin/time -v -o {log.time_view} \
            apptainer run --app odgi {params.app_path}/PanGeTools.sif \
            view -i $OGfile.unchoped.sorted.og -g \
            1> {output.gfa} \
            2> >(tee {log.cmd_view} >&2) 

        ## Removing .og files for space savings
        rm $(dirname {input})/*.og

        ## Adding metadata
        python scripts/getTags.py \
            --appdir {params.app_path} --config-file config.yaml \
            > "$(dirname {input})/metadata.txt"
        sed -i "/^H/r $(dirname {input})/metadata.txt" {output.gfa}

        # Compressing
        # apptainer run --app bgzip {params.app_path}/PanGeTools.sif \
        #     -@ {threads} -k {output.gfa}
        """

rule generate_graph_list:
    # Generate a text file containing all created graphs
    input:
        expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST)
    output:
        "data/chrGraphs/graphsList.txt"
    threads: 1
Alexis Mergez's avatar
Alexis Mergez committed
    priority: 100
Alexis Mergez's avatar
Alexis Mergez committed
    run:
        with open(output[0], "w") as handle:
            for file in input:
                handle.write(file+"\n")

rule graph_squeeze:
    # Using odgi to merge every subgraphs into a final one
    input:
        glist="data/chrGraphs/graphsList.txt",
        graphs=expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST)
    output:
        "output/pan1c."+config['name']+".gfa"
Alexis Mergez's avatar
Alexis Mergez committed
    log: 
        cmd="logs/squeeze/"+config['name']+".squeeze.cmd.log",
        time="logs/squeeze/"+config['name']+".squeeze.time.log",
Alexis Mergez's avatar
Alexis Mergez committed
    threads: 16
Alexis Mergez's avatar
Alexis Mergez committed
    priority: 100
Alexis Mergez's avatar
Alexis Mergez committed
    params:
        app_path=config['app.path']
    shell:
        """
Alexis Mergez's avatar
Alexis Mergez committed
        /usr/bin/time -v -o {log.time} \
            apptainer run --app odgi {params.app_path}/PanGeTools.sif \
            squeeze -t {threads} -O -P -f {input.glist} -o {output}.og 2>&1 | \
            tee {log.cmd}

Alexis Mergez's avatar
Alexis Mergez committed
        apptainer run --app odgi {params.app_path}/PanGeTools.sif \
            view -t {threads} -P -i {output}.og -g > {output}
Alexis Mergez's avatar
Alexis Mergez committed

Alexis Mergez's avatar
Alexis Mergez committed
        rm {output}.og
        """

rule graph_stats:
    # Using GFAstats to produce stats on every chromosome graphs
    input:
        graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa'
    output:
        genstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.general.stats.tsv",
        pathstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.path.stats.tsv"
    threads: 4
    params:
        app_path=config['app.path'],
        pan_name=config['name']
    shell:
        """
        apptainer run --app gfastats {params.app_path}/PanGeTools.sif \
            -g {input.graph} -P \
            -o $(dirname {output.genstats})/{params.pan_name}.{wildcards.chromosome} \
            -t {threads}
        """

rule graph_figs:
    # Creating figures using odgi viz 
    input:
        graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa'
    output:
        oneDviz="output/chrGraphs.figs/"+config['name']+".{chromosome}.1Dviz.png",
        pcov="output/chrGraphs.figs/"+config['name']+".{chromosome}.pcov.png"
    threads: 4
    params:
        app_path=config['app.path'],
        oneDviz=config['odgi.1Dviz.params'],
        pcov=config['odgi.pcov.params']
    shell:
        """
        ## 1D viz
        apptainer run --app odgi {params.app_path}/PanGeTools.sif \
            viz -i {input.graph} -o {output.oneDviz} {params.oneDviz} -t {threads} -P

        ## Pcov viz
        apptainer run --app odgi {params.app_path}/PanGeTools.sif \
            viz -i {input.graph} -o {output.pcov} {params.pcov} -t {threads} -P
        """

rule aggregate_graphs_stats:
    # Reading and merging all stats files from chromosome graphs into a .tsv.
    input:
        genstats=expand("output/stats/chrGraphs/"+config['name']+".{chromosome}.general.stats.tsv", chromosome=CHRLIST)
    output:
        genstats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv",
        pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv"
Alexis Mergez's avatar
Alexis Mergez committed
    threads: 1
Alexis Mergez's avatar
Alexis Mergez committed
    params:
        app_path=config['app.path'],
        pan_name=config['name']
    shell:
        """
        apptainer run {params.app_path}/pan1c-env.sif python scripts/chrStatsAggregation.py \
            --input $(dirname {input[0]}) --outputGeneral {output.genstats} \
            --outputPaths {output.pathstats} --panname {params.pan_name} 
        """

rule final_graph_tagging:
    # Add metadata to the final GFA
    input:
        graph="output/pan1c."+config['name']+".gfa",
    output:
        "output/pan1c."+config['name']+".gfa.metadata"
    threads: 1
Alexis Mergez's avatar
Alexis Mergez committed
    priority: 100
Alexis Mergez's avatar
Alexis Mergez committed
    params:
        app_path=config['app.path'],
        pan_name=config['name']
    shell:
        """
        python scripts/getTags.py \
            --appdir {params.app_path} --config-file config.yaml > {output}
        sed -i '/^H/r {output}' {input.graph}
        """

rule pggb_input_stats:
    # Produces statistics on pggb input sequences
    input:
        flag="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv"
    output:
        "output/stats/pan1c."+config['name']+".chrInput.stats.tsv"
Alexis Mergez's avatar
Alexis Mergez committed
    threads: 1
Alexis Mergez's avatar
Alexis Mergez committed
    params:
        app_path=config['app.path'],
        pan_name=config['name']
    shell:
        """
        apptainer run {params.app_path}/pan1c-env.sif python scripts/chrInputStats.py \
            -f data/chrInputs/*.fa.gz -o {output} -p {params.pan_name}
        """

rule core_statistics:
    # Aggregate chrInput, chrGraph and pggb statistics into a single tsv 
    input:
Alexis Mergez's avatar
Alexis Mergez committed
        chrInputStats = "output/stats/pan1c."+config['name']+".chrInput.stats.tsv",
        chrGraphStats = "output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv"
Alexis Mergez's avatar
Alexis Mergez committed
    output:
Alexis Mergez's avatar
Alexis Mergez committed
        tsv = "output/stats/pan1c."+config['name']+".core.stats.tsv",
        dir = directory("output/pggb.usage.figs")
    threads: 1
Alexis Mergez's avatar
Alexis Mergez committed
    params:
        app_path=config['app.path'],
        pan_name=config['name']
    shell:
        """
        mkdir -p {output.dir}
        apptainer run {params.app_path}/pan1c-env.sif python scripts/coreStats.py \
            --pggbStats logs/pggb --chrInputStats {input.chrInputStats} \
            --chrGraphStats {input.chrGraphStats} -o {output.tsv} -f {output.dir} -p {params.pan_name}
        """

"""
Alexis Mergez's avatar
Alexis Mergez committed
Post-processing section
Alexis Mergez's avatar
Alexis Mergez committed
"""
rule get_pav:
    # Create PAV matrix readable by panache for a given chromosome scale graph
    input:
        "data/chrGraphs/graphsList.txt"
    output:
        directory("output/pav.matrices")
    threads: 16
    params:
        app_path=config['app.path']
    run:
        shell("mkdir {output}")
        # Getting the list of graphs
        with open(input[0]) as handle:
            graphList = [graph.rstrip("\n") for graph in handle.readlines()]
        # Iterating over graphs
        for graph in graphList:
            shell("bash scripts/getPanachePAV.sh -g {graph} -d data/chrGraphs/$(basename {graph} .gfa) -o {output}/$(basename {graph} .gfa).pav.matrix.tsv -a {params.app_path} -t {threads}")

rule panacus_stats:
    # Produces panacus reports for a chromosome graph
    input:
        graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa'
    output:
        html='output/panacus.reports/'+config['name']+'.{chromosome}.histgrowth.html'
Alexis Mergez's avatar
Alexis Mergez committed
    log: 
        cmd="logs/panacus/{chromosome}.panacus.cmd.log",
        time="logs/panacus/{chromosome}.panacus.time.log"
Alexis Mergez's avatar
Alexis Mergez committed
    params:
        app_path=config['app.path'],
        pan_name=config['name'],
        refname=config['reference']
    threads: 8
    shell:
        """
Alexis Mergez's avatar
Alexis Mergez committed
        /usr/bin/time -v -o {log.time} \
            bash scripts/getPanacusHG.sh \
Alexis Mergez's avatar
Alexis Mergez committed
            -g {input.graph} \
            -r $(basename {params.refname} .fa.gz) \
            -d data/chrGraphs/{wildcards.chromosome} \
            -o {output.html} \
            -a {params.app_path} \
Alexis Mergez's avatar
Alexis Mergez committed
            -t {threads} 2>&1 | \
            tee {log.cmd}
Alexis Mergez's avatar
Alexis Mergez committed
        """

rule create_pan1c_report_fig:
    # Produces a markdown report figure of chromosomes graphs
    input:
        graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa',
        contigfig="output/chr.contig/{chromosome}.contig.png",
    output:
        odgifig=temp("tmp/{chromosome}.odgi.png"),
        namefig=temp("tmp/{chromosome}.name.png"),
        reportfig="output/report/"+config['name']+".{chromosome}.report.fig.png"
    threads: 4
    params:
        app_path=config['app.path']
    shell:
        """
        ## Odgi 1D viz
        apptainer run --app odgi {params.app_path}/PanGeTools.sif \
            viz -i {input.graph} -o {output.odgifig} -x 2500 -a 80 -b -H -t {threads} -P

        ## Getting legend from contig figure
        convert {input.contigfig} -crop 790x+0+0 +repage {output.namefig}

        odgheight=$(identify -ping -format '%h' {output.odgifig})
        ctgheight=$(identify -ping -format '%h' {input.contigfig})

        combinedheight=$(($odgheight+$ctgheight))

        echo -e "[Debug::Report fig]\t$odgheight\t$ctgheight\t$combinedheight"

        ## Creating empty canvas ad adding other figures to it
        convert -size "3300x$combinedheight" xc:white {output.reportfig}
        composite -geometry +0+0 {input.contigfig} {output.reportfig} {output.reportfig}
        composite -geometry "+0+$ctgheight" {output.namefig} {output.reportfig} {output.reportfig}
        composite -geometry "+790+$ctgheight" {output.odgifig} {output.reportfig} {output.reportfig}
        """

Alexis Mergez's avatar
Alexis Mergez committed
def get_report_sections(wildcards):
Alexis Mergez's avatar
Alexis Mergez committed
    """
    Return 'create_pan1c_report' optional inputs to add them to the final report.
    For example :
        - SyRI figures on Assemblies
    """
    sections = dict()

    sections["metadata"] = "output/pan1c."+config['name']+".gfa.metadata"
Alexis Mergez's avatar
Alexis Mergez committed
    sections["odgifigs"] = expand("output/report/"+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST)
    sections["genstats"] = "output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv"
    sections["pathstats"] = "output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv"
Alexis Mergez's avatar
Alexis Mergez committed

    if config["get_ASMs_SyRI"] == "True":
        sections["SyRI_on_ASMs_figs"] = expand(
            "output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", 
            haplotype=SAMPLES_NOREF
            )

Alexis Mergez's avatar
Alexis Mergez committed
    return sections      
Alexis Mergez's avatar
Alexis Mergez committed

Alexis Mergez's avatar
Alexis Mergez committed
rule create_pan1c_report:
    # Produces a markdown report of chromosomes graphs
    input:
Alexis Mergez's avatar
Alexis Mergez committed
        unpack(get_report_sections)
Alexis Mergez's avatar
Alexis Mergez committed
    output:
Alexis Mergez's avatar
Alexis Mergez committed
        report="output/pan1c."+config['name']+".report.md"
Alexis Mergez's avatar
Alexis Mergez committed
    threads: 4
    params:
Alexis Mergez's avatar
Alexis Mergez committed
        app_path=config['app.path'],
        add_SyRI=config['get_ASMs_SyRI']
Alexis Mergez's avatar
Alexis Mergez committed
    run:
Alexis Mergez's avatar
Alexis Mergez committed
        shell("touch {output.report}")

        # Adding Summary (made for importing in Joplin)
        shell("echo '# Summary' >> {output.report}")
        shell("echo '[TOC]' >> {output.report}")
        shell("echo '' >> {output.report}")
        
        # Adding graph construction info
        ## WIP
Alexis Mergez's avatar
Alexis Mergez committed
        # Adding metadata
        shell("echo '# Graph metadata' >> {output.report}")
Alexis Mergez's avatar
Alexis Mergez committed
        shell("echo '```' >> {output.report}")
Alexis Mergez's avatar
Alexis Mergez committed
        shell("cat {input.metadata} >> {output.report}")
Alexis Mergez's avatar
Alexis Mergez committed
        shell("echo '```' >> {output.report}")
Alexis Mergez's avatar
Alexis Mergez committed
        shell("echo '' >> {output.report}")
Alexis Mergez's avatar
Alexis Mergez committed

        # Adding General stats
Alexis Mergez's avatar
Alexis Mergez committed
        shell("echo '# General stats' >> {output.report}")
        shell("cat {input.genstats} | csv2md -d $'\\t' >> {output.report}")
        shell("echo '' >> {output.report}")
Alexis Mergez's avatar
Alexis Mergez committed

        # Adding Path stats
Alexis Mergez's avatar
Alexis Mergez committed
        shell("echo '# Path stats' >> {output.report}")
        shell("cat {input.pathstats} | csv2md -d $'\\t' >> {output.report}")
        shell("echo '' >> {output.report}")
Alexis Mergez's avatar
Alexis Mergez committed

        # Adding chromosomes figures
Alexis Mergez's avatar
Alexis Mergez committed
        fig_list = [fig for fig in input.odgifigs]
Alexis Mergez's avatar
Alexis Mergez committed
        print(fig_list)
        fig_list.sort()
Alexis Mergez's avatar
Alexis Mergez committed
        shell("echo '# Chromosome-scale odgi graphs' >> {output.report}")
Alexis Mergez's avatar
Alexis Mergez committed
        for fig in fig_list:
            basename=os.path.basename(fig)
            chr_name=basename.split('.')[1]
Alexis Mergez's avatar
Alexis Mergez committed
            shell("echo '## {chr_name}' >> {output.report}")
            shell("echo '![{basename}](./report/{basename})' >> {output.report}")
        shell("echo '' >> {output.report}")

Alexis Mergez's avatar
Alexis Mergez committed
        # Adding SyRI figs if produced
        if params.add_SyRI:

            fig_list = [fig for fig in input.SyRI_on_ASMs_figs]
            fig_list.sort()

            shell("echo '# SyRI on input assemblies' >> {output.report}")
            for fig in fig_list:
                basename = os.path.basename(fig)
                hap_name = basename.split('.')[2:4]

                shell("echo '## {hap_name[0]}, {hap_name[1]}' >> {output.report}")
                shell("echo '![{basename}](./asm.syri.figs/{basename})' >> {output.report}")

            shell("echo '' >> {output.report}")