# Metagenome Orchestra V2.2a configuration file template
#
# Last modification of this document: 2019-08-12
#
#
#
# =================================================
# Mago web site: http://mago.fe.uni-lj.si
# =================================================
#
# Metagenome Orchestra (magoOrchestra, Orchestra or Mago)
# is distributed under the
# Creative Commons Attribution CC BY license
# https://creativecommons.org/licenses/
#
#
#
# ------------------------------------------------------
# If you use Orchestra or its derivatives, please cite:
# ...submitted for publication; please check back at the
# Orchestra web site for a reference update.
#
# Please also cite the software that Orchestra contains
# and that you use for your pipeline processing.
# Please see credits.txt at the Orchestra web site
# for details about the included software.
# ------------------------------------------------------
#
#
#
# IMPORTANT: Orchestra is developed and disseminated in good
# faith and with the desire that it work according to
# expectations, but the authors DO NOT give any guarantees
# about its correctness.
#
# USE IT AT YOUR OWN RISK.
#
# The authors cannot be held legally or morally responsible
# for any consequences that may arise from using or misusing
# Orchestra.
#
# IMPORTANT: Orchestra is a skeleton application for the
# synergistic execution of many externally developed pieces
# of software. These are disseminated as integrated parts of
# Orchestra to provide a user-friendly, out-of-the-box
# experience. Nonetheless, every included piece of software
# remains OWNED and COPYRIGHTED by its respective developers.
# Please see the document credits.txt at the Orchestra web site.
#
# =================================================
# Mago web site: http://mago.fe.uni-lj.si
# =================================================
#
#
#
# Metagenome Orchestra is developed by:
#
# Blaz Stres (blaz.stres@gmail.com)
#    University of Ljubljana
#    Biotechnical Faculty,
#    Faculty of Medicine,
#    Faculty of Geodetic and Civil Engineering
#
#    University of Innsbruck (blaz.stres@uibk.ac.at)
#    Department of Microbiology
#
#
#
# Bostjan Murovec (bostjan.murovec@fe.uni-lj.si)
#    University of Ljubljana
#    Faculty of Electrical Engineering
#
# Please contact us with comments, suggestions for
# improvements, or descriptions of discovered bugs.
# Your feedback is most welcome.
#--------------------------------------------------------------------------
# Usage:
#--------------------------------------------------------------------------
#
# 1. Fill in the parameters below according to your preferences.
#
#    A. This document is fairly long because it contains
#       complete instructions for setting Orchestra parameters.
#       For production, a user is expected to remove the
#       majority of comments from this file to make it easier
#       to navigate.
#
#    B. Section titles ([Global], [FastQC], etc.) must appear
#       in the file before the parameters that belong to them.
#       Only sections in which a user actually specifies some
#       parameters need to be present.
#
#    C. Many parameters do not need to be set; in many cases
#       the default values suffice. Please do not be
#       overwhelmed by the number of available settings below.
#       Generally, to start using Orchestra it suffices to
#       specify the input files, an output directory, and the
#       components to be run as part of the processing.
#
#
#
# 2. Start Orchestra by following the container-specific
#    instructions in separate documents.
#
#    A. For the Singularity image:
#
#       http://mago.fe.uni-lj.si/singularity_instructions_V2_2.txt
#
#       Brief help is also available through the command:
#       singularity help /path/to/mago.simg
#
#
#
#    B. For the Docker container:
#
#       http://mago.fe.uni-lj.si/docker_instructions_V2_2.txt
#
#
#
#    C. For the Oracle VirtualBox virtual machine:
#
#       http://mago.fe.uni-lj.si/virtualbox_instructions_V2_2.txt
#
#
#
# 3. After Orchestra is run, its results will be located within
#    the specified output directory, organized in
#    subdirectories. The subdirectory 02_report within the
#    output directory contains the exact command line as well
#    as the captured STDOUT and STDERR outputs of each executed
#    external program, for post-mortem inspection of execution
#    progress, if such a need arises.
#--------------------------------------------------------------------------
# General information about configuration files:
#--------------------------------------------------------------------------
# Lines that begin with a hash (#) are ignored.
# Parameters that are supposed to remain unspecified
# may be commented out with a hash (#) instead of
# being deleted (for documentation purposes).
# Empty lines and lines that contain only spaces and
# tab characters are ignored as well.
# NOTE: absolute paths are advised when specifying
# file and directory names.
# NOTE: the home directory cannot be abbreviated as '~'.
# Instead, please use e.g. /home/user_name/some_dir
# While running, Orchestra checks for the
# presence of intermediate results from its potential
# previous runs. If these are available, they are
# reused in order to speed up execution.
#
# This is useful for experimenting with different
# parameters. If a user changes only the parameters for e.g.
# the CheckM taxonomy workflow, then Orchestra re-executes
# only this part, AND any other parts that may be affected
# by the changes. Steps that are not affected by the
# introduced parameter change are skipped, which saves
# an appreciable amount of execution time.
#
# In order for this to work, please specify the same
# output directory (parameter out_directory_root below)
# on all consecutive runs that logically belong
# together. Please do NOT modify any files within this
# directory yourself.
#
# NOTE 1: if Orchestra crashes during its operation, or
# if it is terminated prematurely by an operator, the
# left-over results may be corrupted. During its NEXT run,
# Orchestra tries to recover from such situations
# automatically by re-running steps with corrupted or
# incomplete results. It also re-runs all additional steps
# that may be affected by these very results. Consequently,
# Orchestra implements a full crash-recovery mechanism.
# If a crash or premature termination occurs, the only
# intervention required from an operator is to
# re-run Orchestra. The configuration file for this
# very re-run can and SHOULD stay the same. An operator
# does not need to, and should NOT, tell Orchestra which
# steps should be re-executed in such cases.
#
# NOTE 2: please do not manually delete intermediate results.
# Orchestra determines the steps to be re-executed by
# inspecting its own status files, and it has almost no
# ability to recover gracefully from manually deleted
# intermediate files.
#
# NOTE 3: Orchestra does not re-execute a certain step of its
# pipeline processing if it believes that the associated
# results are up to date.
# If for some reason you want to force a
# re-execution of such a step, delete the ENTIRE subdirectory
# that is associated with the step. Orchestra is deliberately
# designed to handle deletions of entire subdirectories
# gracefully, since this provides an easy way for an operator
# to force a re-execution, if a need arises. For example, if
# Orchestra believes that the results of the Concoct binner
# are up to date, but you want to re-execute it anyway, then
# simply delete the ENTIRE subdirectory that is associated
# with Concoct.
#
# NOTE 4: a more fine-grained forcing of re-execution is
# possible. Within an output directory, there exists the
# directory 01_status. Within it there is a status file for
# each step that has already run, whether or not it completed
# its execution successfully.
# It is safe to delete any or several of these files, which
# causes Orchestra to consider the associated steps as not
# yet executed. Consequently, it will attempt to execute them
# on the next re-run of the pipeline.
# The association of these files with processing steps should
# be self-evident from their descriptive file names.
#
# NOTE 5: please perform the above-suggested interventions in
# the output directory only when Orchestra is NOT being
# executed.
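# For illustration, and reusing the hypothetical output
# directory /home/johnDoe/Analysis from the examples below, a
# forced re-execution (NOTES 3 and 4) could be performed with
# commands along these lines; the actual subdirectory and
# status file names depend on your run, so inspect the output
# directory first:
#
#    ls /home/johnDoe/Analysis/01_status
#    rm /home/johnDoe/Analysis/01_status/<status_file_of_step>
#
# or, to force an entire step (NOTE 3):
#
#    rm -r /home/johnDoe/Analysis/<subdirectory_of_step>
#
# Again, perform such interventions only while Orchestra is
# NOT running (NOTE 5).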
#
# NOTE 6: if you change the parameters for the execution of a
# certain step, then Orchestra will detect the changes and
# consider the available results outdated. In this case a
# re-run happens automatically, and there is no need for an
# operator to intervene and delete subdirectories and/or
# status files.
#
# ------------------------------------------------------------
# IMPORTANT IMPORTANT IMPORTANT IMPORTANT
# ------------------------------------------------------------
# NOTE 7: Orchestra assumes that a re-run with an EXISTING
# output directory is done with the SAME input files
# (input reads and a potential externally provided scaffold)
# as in its previous run. It is assumed that only the
# parameters of execution are changed (or that the previous
# run was interrupted).
# If your intention is to process different input data,
# then set the output directory to a new, NON-existing
# directory. Failing to do so will corrupt the already
# generated files in the given output directory and
# produce gibberish results.
#
# ------------------------------------------------------------
# IMPORTANT IMPORTANT IMPORTANT IMPORTANT
# ------------------------------------------------------------
# NOTE 8: when you attempt to re-run some steps with
# changed parameters, do NOT disable the already completed
# steps in an attempt to save time. If you leave these
# settings intact, Orchestra will figure out by itself that
# the resulting files are already up to date, and it will NOT
# re-execute unnecessary steps. However, if you disable
# an already completed step, then Orchestra will assume that
# your intention is to execute a different workflow than
# in the previous run, and it will modify its behavior
# accordingly, which is not the intended behavior.
#
# As an example, let us suppose that a CheckM coverage
# file was generated in a previous Orchestra run.
# This file was then input to the CheckM taxonomy workflow.
# In the second Orchestra run, you intend to change
# some parameters of the CheckM taxonomy workflow.
# Hence, this workflow needs to be re-executed, but the
# CheckM coverage file does not need to be re-generated.
# The proper way to handle this situation is to leave the
# generation of the CheckM coverage file ENABLED.
# Orchestra will determine by itself that the coverage
# file is already generated and up to date, so it will
# not waste time re-generating it. The
# already present coverage file will be input to the
# CheckM taxonomy workflow, which will be re-executed
# (as expected, due to the changes of its parameters).
# On the other hand, if the generation of the CheckM coverage
# file had been disabled in a MISGUIDED attempt to save some
# execution time, then Orchestra would assume that the CheckM
# taxonomy workflow is supposed to be executed without the
# CheckM coverage file as one of its inputs.
#
# In conclusion, do NOT try to tell Orchestra
# which pipeline steps need to be re-executed. Instead,
# on each run, just indicate how Orchestra should proceed,
# as if it were its first execution.
# The rest is (hopefully) taken care of automatically.
#************************************************************
# Global parameters that affect the execution
# of Orchestra as a whole.
#************************************************************
[Global]
# OPTIONAL: memory_limit_type (default: None)
#
# Upper bound on the memory that Orchestra
# is allowed to consume during its execution.
#
# It is a good idea to limit memory consumption,
# since many programs that are included in
# Orchestra may exhaust the available resources
# fairly quickly and destabilize the underlying
# operating system by causing excessive swapping
# of working memory to disk.
#
# The available memory-limiting options are:
#
# None:   no limitations; this may be a good
#         option if you run Orchestra on your
#         own computer and you want to maximize
#         the chance that Orchestra completes
#         its processing. On the other hand,
#         if Orchestra is run on a High
#         Performance Computing (HPC) facility,
#         then some memory limiting is the
#         recommended choice, since excessive
#         swapping may impair the execution of
#         other programs (from other users) on
#         the same machine.
#
# Free:   set the memory limit to the amount of free
#         memory as determined at the beginning
#         of an Orchestra run.
#
# Manual: manually specify the memory limit in GigaBytes
#         with parameter manual_memory_limit_GB (below).
#
# If some processing step requires more memory than
# the specified limit allows, then such a step will
# be terminated abruptly and its results will
# be corrupted. Orchestra handles such situations
# gracefully by not executing any further
# processing steps that rely on these very results.
#memory_limit_type = None
#memory_limit_type = Free
#memory_limit_type = Manual
# REQUIRED IF PARAMETER memory_limit_type IS SET TO Manual:
# manual_memory_limit_GB
#
# Manual specification of the memory limit in GigaBytes.
# It is relevant only when parameter memory_limit_type
# is set to Manual. Floating point values are allowed,
# so it is possible to specify the limit with a finer
# granularity than whole GigaBytes.
#manual_memory_limit_GB = 50     # 50 GigaBytes
#manual_memory_limit_GB = 20.3   # 20 GigaBytes + 300 MegaBytes
# REQUIRED: out_directory_root
#
# Specification of an output directory.
#
# This directory is created during Orchestra execution
# to store pipeline results. Alternatively, results
# of previous executions may already be present in
# the specified output directory, in which case
# Orchestra avoids re-running certain steps.
#
# ----------------------------------------------------
# This is NOT a directory with input files.
#
# Input files may be, and SHOULD be, located at
# arbitrary different locations.
# ----------------------------------------------------
#
# Example: by making the following setting:
#
#    out_directory_root = /home/johnDoe/Analysis
#
# it is expected that the directory /home/johnDoe/Analysis
# does NOT already exist prior to Orchestra execution,
# or that it was created by Orchestra itself during
# one of its previous runs.
# On the other hand, the parent directory
# /home/johnDoe should already exist in any case.
#out_directory_root = /home/user/some_out_dir
# BOTH REQUIRED: input_R1_reads_file
#                input_R2_reads_file
#
# File names of input R1 and R2 reads in fastq format.
#
# A. One possibility is to specify the file names of one R1
#    and one R2 file.
#
# B. Several R1 and R2 files may be targeted by standard Linux
#    file patterns like "sequences_R1_*.fastq" and
#    "sequences_R2_*.fastq".
#
# C. It is NOT possible to enumerate several R1 and/or R2
#    files by specifying these two parameters more than once.
#
#
#
# NOTE 1: each individual file pattern must target only R1 or
#    R2 reads, and there must be no mix-up between the two
#    types of files. If in doubt, please open a Linux terminal
#    in the directory with the R1 and/or R2 files, and execute
#    something like "ls some_R1_*.fastq" and
#    "ls some_R2_*.fastq" to check the list of files that are
#    targeted by a specified file pattern (some_R1_*.fastq and
#    some_R2_*.fastq in this example).
#
# NOTE 2: all R1 files to be matched by a specified file
#    pattern must be located in the same directory. Likewise
#    for R2 files. Both sets of files may be (and typically
#    are) located in the same directory, but it is also
#    possible that the R1 files are located in a different
#    directory, or even on a different disk/partition, than
#    the R2 files.
#
# NOTE 3: each R1 file that is targeted by the R1 file pattern
#    must be matched with the corresponding R2 file of the R2
#    file pattern. The file names of corresponding files must
#    differ only in the R1 and R2 name fragment.
#    E.g. Orchestra will match file "some_file_R1_00037.fastq"
#    with "some_file_R2_00037.fastq". Any mismatches of the R1
#    and R2 file names will produce gibberish results.
#
#
#
# TIP: if your directory with R1 (and/or) R2 files contains
#    more files than you intend to process within a single
#    Orchestra run, and if it is impossible to target only the
#    appropriate subset of them with a file pattern, then it
#    is possible to proceed as follows.
#
#    SOLUTION 1: create a new directory
#       (say /home/me/special_reads).
#       Then copy or move the appropriate subset of R1 and
#       R2 files to the new directory.
#       This way it is easy to target only these files
#       with generic file patterns like
#       /home/me/special_reads/*R1*.fastq and
#       /home/me/special_reads/*R2*.fastq.
#
#    SOLUTION 2: create a new directory
#       (say /home/me/special_reads).
#       Within this directory create symbolic links to the
#       appropriate files. Symbolic links are created with the
#       command "ln -s /path/to/physical_file /path/to/symlink".
#       Then target all of the linked files with the generic
#       patterns /home/me/special_reads/*R1*.fastq and
#       /home/me/special_reads/*R2*.fastq.
#       A brief sketch of this approach is shown below.
#
#    Solution 1 is probably easier to do, whereas solution 2
#    has the advantage that no files are actually moved around
#    the file system. This way, the same files may be linked
#    into different directories and simultaneously take part
#    in different input combinations without occupying disk
#    space more than once.
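# As a minimal sketch of SOLUTION 2 (all directory and file
# names here are hypothetical), the subset of reads could be
# linked and then verified like this:
#
#    mkdir /home/me/special_reads
#    ln -s /home/me/all_reads/sampleA_R1.fastq /home/me/special_reads/
#    ln -s /home/me/all_reads/sampleA_R2.fastq /home/me/special_reads/
#    ls /home/me/special_reads/*R1*.fastq
#    ls /home/me/special_reads/*R2*.fastq
#
# The two ls commands show exactly which files the patterns
# specified below will target (see NOTE 1 above).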
#input_R1_reads_file = /home/user/input_dir/some_name_R1.fastq
#input_R2_reads_file = /home/user/input_dir/some_name_R2.fastq
#input_R1_reads_file = /home/user/input_dir/some_pattern_*R1*.fastq
#input_R2_reads_file = /home/user/input_dir/some_pattern_*R2*.fastq
#input_R1_reads_file = /home/user/separate_dir_R1/some_pattern_*R1*.fastq
#input_R2_reads_file = /home/user/separate_dir_R2/some_pattern_*R2*.fastq
# OPTIONAL: input_scaffold_file_1
#           input_scaffold_file_2
#           input_scaffold_file_3
#           ...
#           input_scaffold_file_n
#
# File names of an arbitrary number of externally provided
# input scaffold files in fasta format.
#
# The suffixes "_1", "_2", "_3", etc. enumerate these
# scaffolds, which Orchestra tracks internally under
# the respective tags "external_1", "external_2",
# "external_3", etc.
#
# External scaffold files may be provided instead of, or in
# addition to, the scaffolds that are built with the internal
# assemblers (see below).
#
# It is valid to provide a scaffold file that was built by
# an UNRELATED Orchestra run. On the other hand, if you merely
# repeat the Orchestra processing with the same input files
# and with the same target output directory, then Orchestra
# will reuse the already present scaffolds; in that case,
# please do not specify these scaffolds as external.
# Use the input_scaffold_file_n parameters only for scaffolds
# that were built by separate and independent Orchestra
# runs. Consequently, these external scaffolds should be
# located outside the current output directory.
#
# Orchestra attempts to read as many parameters of
# the form "input_scaffold_file_x" as are specified.
# However, their enumeration must be consecutive. Orchestra
# stops enumerating external scaffolds as soon as a
# certain external scaffold parameter within the sequence
# does not exist.
#
# For example, by specifying the following parameters:
#
#    input_scaffold_file_1 = /some_dir/some_scaffold.fasta
#    input_scaffold_file_2 = /another_dir/another_scaffold.fasta
#    input_scaffold_file_4 = /still_some_dir/still_another_scaffold.fasta
#    input_scaffold_file_5 = /distant_dir/excelent_scaffold.fasta
#    input_scaffold_file_6 = /near_dir/suspicious_scaffold.fasta
#
# only the external scaffolds some_scaffold.fasta (_1) and
# another_scaffold.fasta (_2) will be processed. Orchestra
# will try to read parameter input_scaffold_file_3, which
# does not exist, so it will stop its attempt to enumerate
# further external scaffolds. Hence, it is vital that input
# scaffolds are specified as a consecutive sequence of
# integers, starting at 1.
#
# Orchestra recognizes the literal "None" in place of
# an external scaffold file name. Such an entry instructs
# Orchestra to ignore the parameter in question but to
# continue enumerating further external scaffolds.
#
# For example, the above example may be augmented in
# the following manner:
#
#    input_scaffold_file_1 = /some_dir/some_scaffold.fasta
#    input_scaffold_file_2 = /another_dir/another_scaffold.fasta
#    input_scaffold_file_3 = None
#    input_scaffold_file_4 = /still_some_dir/still_another_scaffold.fasta
#    input_scaffold_file_5 = /distant_dir/excelent_scaffold.fasta
#    input_scaffold_file_6 = /near_dir/suspicious_scaffold.fasta
#
# This time, Orchestra enumerates all 5 specified scaffolds
# and tags them according to their specification. E.g.
# suspicious_scaffold.fasta is tagged internally as external_6.
#
# There can be as many "None" specifications as desired.
# The following example is fully valid.
#
#    input_scaffold_file_1  = /some_dir/some_scaffold.fasta
#    input_scaffold_file_2  = /another_dir/another_scaffold.fasta
#    input_scaffold_file_3  = None
#    input_scaffold_file_4  = None
#    input_scaffold_file_5  = None
#    input_scaffold_file_6  = None
#    input_scaffold_file_7  = None
#    input_scaffold_file_8  = /still_some_dir/still_another_scaffold.fasta
#    input_scaffold_file_9  = None
#    input_scaffold_file_10 = None
#    input_scaffold_file_11 = /distant_dir/excelent_scaffold.fasta
#    input_scaffold_file_12 = None
#    input_scaffold_file_13 = None
#    input_scaffold_file_14 = None
#    input_scaffold_file_15 = None
#    input_scaffold_file_16 = None
#    input_scaffold_file_17 = /near_dir/suspicious_scaffold.fasta
#    input_scaffold_file_18 = None
#    input_scaffold_file_19 = None
#    input_scaffold_file_20 = None
#
#
#
# IMPORTANT: the "None" specification is provided in order to
# easily exclude some scaffolds from repeated analyses, while
# at the same time the tags of the remaining scaffolds are
# preserved. THIS POINT IS CRUCIAL.
#
# For example, let us suppose that an initial Orchestra
# analysis is done with the following three external scaffolds.
#
#    input_scaffold_file_1 = /some_dir/some_scaffold.fasta
#    input_scaffold_file_2 = /another_dir/another_scaffold.fasta
#    input_scaffold_file_3 = /still_some_dir/still_another_scaffold.fasta
#
# Upon an Orchestra re-run, it is decided that external
# scaffold 2 should be excluded from the analysis. The WRONG
# approach is to re-enumerate the third scaffold into the
# second one.
#
# WRONG: input_scaffold_file_1 = /some_dir/some_scaffold.fasta
# WRONG: input_scaffold_file_2 = /still_some_dir/still_another_scaffold.fasta
#
# Orchestra preserves all data about previously consumed
# scaffolds (their indexing, generation of SAM, BAM and
# indexed BAM files, abundances, constellated bins, ...).
# By renaming, and consequently re-tagging, a scaffold,
# Orchestra is forced to delete all this information and
# regenerate it once more. This is a huge waste of time.
#
# Instead, care should be taken that external scaffolds from
# RELATED previous Orchestra runs (which target the same
# output directory) preserve their tags. The above WRONG
# situation should be remedied in the following CORRECT way.
#
# CORRECT:
#    input_scaffold_file_1 = /some_dir/some_scaffold.fasta
#    input_scaffold_file_2 = None
#    input_scaffold_file_3 = /still_some_dir/still_another_scaffold.fasta
# If no external scaffold is provided, then at least one
# built-in assembler needs to be enabled.
#
# NOTE 1: do not specify a scaffold file that was generated
#    within a previous Orchestra run (and is, therefore,
#    located within the output directory). If such a scaffold
#    exists, and if its associated assembler is ENABLED by the
#    appropriate setting below, then Orchestra will consume
#    the available previous results WITHOUT re-running the
#    corresponding assembler.
#
#    Of course, you may copy Orchestra-generated scaffold
#    files to another place and use them as an input to some
#    other Orchestra run, instead of assembling them
#    from scratch every time.
#
# NOTE 2: Orchestra assumes that a re-run with an EXISTING
#    output directory is done with the SAME input files.
#    This holds for a scaffold file specified by
#    input_scaffold_file_x as well. If you change the
#    scaffold file between different pipeline runs,
#    Orchestra will not detect the change, and it will
#    reuse the potentially already generated intermediate
#    files, which are derived from the previous
#    version of the scaffold. Consequently, the end results
#    will be gibberish.
#
#    If your intention is to process the same input
#    data in conjunction with a different scaffold,
#    then set the output directory to a new, NON-existing
#    directory.
#
#    However, it is fully supported to leave
#    input_scaffold_file_x undefined on a consecutive run,
#    and then to enable it again in some future Orchestra
#    re-run, provided that the specified scaffold
#    file remains the same.
#
# When several scaffolds are available for pipeline processing
# (for example due to the specification of
# input_scaffold_file_x parameter(s) together with enabling at
# least one assembler, and/or by enabling several assemblers),
# then Orchestra executes a Cartesian product of the enabled
# binners with all available scaffolds and enabled SAM
# generation methods (as described further on).
#
# For example, if two scaffolds are available and three SAM
# generation methods are enabled, then each enabled binner is
# run 2x3=6 times.
#
# A separate DasTool analysis (if DasTool is enabled) is
# performed for each available scaffold. An individual DasTool
# run takes into account the results of all binning steps that
# are associated with a certain scaffold.
#
# Please note that by enabling several scaffolds and SAM
# methods, the execution time of the entire pipeline, as well
# as its disk space consumption, may increase significantly.
#
# Generally, only one external scaffold or one
# scaffold-generation method, together with only one
# SAM-generation method, is supposed to be enabled, unless the
# intention is to compare the relative performance of these
# methods.
#input_scaffold_file_1 = /home/user/scaffold_dir/some_scaffold.fa
# OPTIONAL: delete_intermediate_files (default NO)
#
# Set to Yes to make Orchestra delete intermediate
# files during processing, as soon as they are not
# needed any more, in order to save disk space.
# Orchestra deletes only those files that are not
# needed for the rest of the current run, nor
# in the future for its potential re-executions.
# Therefore, setting this parameter to Yes does
# not prevent efficient re-execution of Orchestra
# in the same output directory (e.g. to experiment
# with different parameters).
# In addition, Orchestra only deletes files that
# are related to a certain processing step after
# the very step has completed successfully. In the
# case of an error, all files that are related
# to the erroneous step are left untouched as an
# aid for a post-mortem analysis of the error.
#delete_intermediate_files = Yes
# OPTIONAL: number_of_threads
#
# Number of threads to use for parallel execution.
#
# If the parameter is not specified, Orchestra
# will try to determine the number of available
# processors by querying the underlying
# operating system, and it will consume as many of
# them as available.
#
# If your intention is not to consume all available
# resources (e.g. because the same hardware will
# execute some other calculations in parallel), then
# you may manually set the value of this parameter to
# a LOWER value than the number of available CPUs.
# Setting this number to a value larger than the
# number of CPUs will DECREASE computation speed
# (but will have no other adverse consequences).
#
# Another reason for lowering this number below the
# actual CPU count is to lower memory consumption.
# If experience shows that Orchestra (in fact,
# some of its external programs) consumes too
# much memory, sometimes (but not always) the issue
# may be alleviated by reducing the number of
# threads that execute in parallel.
#number_of_threads = 4
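# To tie the [Global] parameters together: a minimal working
# configuration might look like the sketch below. All paths
# are hypothetical, and MegaHIT is just one possible choice of
# component; the sections that follow describe all of them.
#
#    [Global]
#    out_directory_root  = /home/johnDoe/Analysis
#    input_R1_reads_file = /home/johnDoe/reads/sample_R1.fastq
#    input_R2_reads_file = /home/johnDoe/reads/sample_R2.fastq
#
#    [MegaHIT]
#    include_MegaHIT = Yes
#
# Everything else may initially be left at its default value.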
#************************************************************
# Parameters for the FastQC Quality Control application
#
# https://www.bioinformatics.babraham.ac.uk/projects/fastqc/
#************************************************************
[FastQC]
# OPTIONAL: include_FastQC (default NO)
#
# Set to Yes to run the FastQC quality control application
# on the input data. Its results are placed in a separate
# subdirectory within the specified output directory
# for later inspection. Aside from this, these results
# are not used in any way in the later steps of pipeline
# processing. In order to filter input sequences based
# on their quality, use the FastP program (see below).
#include_FastQC = yes
# OPTIONAL: params_FastQC (quote delimited)
#
# Specify any algorithmic parameters that FastQC knows about.
#
# Example: params_FastQC = "--kmers 4"
#
# NOTE 1: this option is available for experienced users.
#    Leave params_FastQC unset, if you do not have a
#    specific reason to do otherwise, or if you are
#    not familiar with FastQC parameters.
#
# NOTE 2: you should NOT specify any input and output files,
#    since Orchestra handles these; only parameters
#    that affect calculations are feasible here.
#
# NOTE 3: the value of this parameter is passed directly
#    to FastQC without any checking or formatting.
#    It is the responsibility of the user to assure
#    the correctness of these parameters.
#
# NOTE 4: each parameter that is not set in this
#    manner is set to its default value by FastQC itself.
#    Please consult the FastQC documentation for details.
#
# -----------------------------------------------------------
# VIABLE PARAMETERS ARE THE FOLLOWING.
# Descriptions are copied verbatim
# (or slightly modified) from FastQC help.
# If you have questions regarding these parameters,
# please consult the documentation or authors of FastQC.
# -----------------------------------------------------------
#
# --nogroup       Disable grouping of bases for reads >50bp.
#                 All reports will show data for every base
#                 in the read. WARNING: Using this option will
#                 cause fastqc to crash and burn if you use it
#                 on really long reads, and your plots may end
#                 up a ridiculous size. You have been warned!
#
# -c              Specifies a non-default file which contains
# --contaminants  the list of contaminants to screen
#                 overrepresented sequences against. The file
#                 must contain sets of named contaminants in
#                 the form name[tab]sequence. Lines prefixed
#                 with a hash will be ignored.
#
# -a              Specifies a non-default file which contains
# --adapters      the list of adapter sequences which will be
#                 explicitly searched against the library. The
#                 file must contain sets of named adapters in
#                 the form name[tab]sequence. Lines prefixed
#                 with a hash will be ignored.
#
# -l              Specifies a non-default file which contains
# --limits        a set of criteria which will be used to
#                 determine the warn/error limits for the
#                 various modules. This file can also be used
#                 to selectively remove some modules from the
#                 output all together. The format needs to
#                 mirror the default limits.txt file found in
#                 the Configuration folder.
#
# -k --kmers      Specifies the length of Kmer to look for in
#                 the Kmer content module. Specified Kmer
#                 length must be between 2 and 10. Default
#                 length is 7 if not specified.
#
# -----------------------------------------------------------
# END OF VIABLE PARAMETERS.
# -----------------------------------------------------------
#
# Leave params_FastQC unset, if you do not have a specific
# reason to do otherwise, or if you are not familiar
# with FastQC parameters.
#params_FastQC = "..."
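# As one concrete (hypothetical) illustration, combining two
# of the options listed above could look like:
#
#    params_FastQC = "--kmers 8 --nogroup"
#
# This is only a sketch; whether such settings are appropriate
# depends on your read lengths and on the FastQC warning about
# --nogroup quoted above.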
#************************************************************
# Parameters for the FastP Quality Control application
#
# https://github.com/OpenGene/fastp
#************************************************************
[FastP]
# OPTIONAL: include_FastP (default NO)
#
# Set to Yes to run the FastP quality control application
# on the input data. FastP results are placed in a separate
# subdirectory within the specified output directory for
# later inspection.
#include_FastP = yes
# OPTIONAL: FastP_generate_output (default NO)
#
# Set to Yes to generate FastP-filtered R1 and R2 files,
# which contain only sequences that fulfil the prescribed
# quality criteria. Set this parameter to No, if you would
# only like to generate FastP reports about the input data.
#FastP_generate_output = yes
# OPTIONAL: FastP_filter (default NO)
#
# Set to Yes to use the FastP-filtered input data in further
# steps of pipeline processing. This way only sequences
# of a prescribed quality take part in scaffold building,
# binning, and other analyses.
# Set this parameter to No, if FastP should merely
# generate reports, and possibly filtered R1 and R2 files,
# which are stored on disk but otherwise do not take
# part in further analysis. The entire set of original
# R1 and R2 reads then takes part in scaffold building,
# binning, and other analyses.
#
# Note 1: if this parameter is set to Yes, then FastP
#    output is going to be produced regardless
#    of the above setting FastP_generate_output.
#    FastP_generate_output will be set to Yes
#    automatically by Orchestra.
#
# Note 2: if the above setting FastP_generate_output
#    is set to Yes, but parameter FastP_filter
#    is set to No, then the FastP output sequences
#    will be placed in the FastP subdirectory
#    within the output directory, but they will
#    not be used further on in the analysis.
#    The analysis will be executed on the
#    unaltered input files.
#
# Note 3: if you set FastP_filter to Yes, then it is
#    advised to set FastP_generate_output to Yes
#    manually as well. If you intend to check the
#    influence of filtering on further results, then
#    you will probably run Orchestra one time with
#    FastP_filter = Yes, and another time with
#    FastP_filter = No. If FastP_generate_output
#    is set to Yes manually in both cases, then
#    Orchestra will notice that the FastP parameters
#    have not changed between the two runs, and it
#    will avoid re-running FastP. In contrast, if
#    FastP_generate_output is set to Yes by Orchestra
#    as a consequence of the setting FastP_filter = Yes,
#    then this setting is not preserved when
#    FastP_filter is set to No. Therefore, the FastP
#    parameters are not preserved either, which
#    results in an unnecessary re-run of FastP.
#    However, the only adverse consequence of this
#    outcome is a bit of wasted processing time.
#FastP_filter = yes
# OPTIONAL: params_FastP (quote delimited)
#
# Specify any algorithmic parameters that FastP knows about.
#
# Example: params_FastP = "--phred64"
#
# NOTE 1: this option is available for experienced users.
#    Leave params_FastP unset, if you do not have a
#    specific reason to do otherwise, or if you are
#    not familiar with FastP parameters.
#
# NOTE 2: you should NOT specify any input and output files,
#    since Orchestra handles these; only parameters
#    that affect calculations are feasible here.
#
# NOTE 3: the value of this parameter is passed directly to
#    FastP without any checking or formatting. It is the
#    responsibility of the user to assure the correctness of
#    these parameters.
#
# NOTE 4: each parameter that is not set in this
#    manner is set to its default value by FastP itself.
#    Please consult the FastP documentation for details.
#
#
#
# -----------------------------------------------------------
#
# Leave params_FastP unset, if you are not familiar with FastP
# parameters. In this case FastP will perform quality scanning
# according to its generic settings, which may or may not be
# a good starting point.
#
# Note 1: often, adapters (see below) do not need to be
#    trimmed by FastP, since sequencers do it by themselves.
#
# Note 2: the default value of parameter
#    --qualified_quality_phred is 15, which is generally too
#    low. Please consider increasing it to 30, maybe 35, or
#    even higher.
#
# -----------------------------------------------------------
# VIABLE PARAMETERS ARE THE FOLLOWING.
# Descriptions are copied verbatim
# (or slightly modified) from FastP help.
# If you have questions regarding these parameters,
# please consult the documentation or authors of FastP.
# -----------------------------------------------------------
#
# -6, --phred64                 indicate the input is using
#                               phred64 scoring (it'll be
#                               converted to phred33, so the
#                               output will still be phred33)
#
# -R, --report_title            report title string, default
#                               is "fastp report"
#
# -A, --disable_adapter_trimming  adapter trimming is enabled
#                               by default. If this option is
#                               specified, adapter trimming is
#                               disabled
#
# -a, --adapter_sequence        the adapter for read1. For SE
#                               data, if not specified, the
#                               adapter will be auto-detected.
#                               For PE data, this is used if
#                               R1/R2 are found not overlapped.
#                               (string [=auto])
#
# --adapter_sequence_r2         the adapter for read2 (PE data
#                               only). This is used if R1/R2
#                               are found not overlapped. If
#                               not specified, it will be the
#                               same as --adapter_sequence
#                               (string [=auto])
#
# --detect_adapter_for_pe       by default, the auto-detection
#                               for adapter is for SE data
#                               input only, turn on this
#                               option to enable it for PE
#                               data.
#
# -f, --trim_front1             trimming how many bases in
#                               front for read1, default is 0
#                               (int [=0])
#
# -t, --trim_tail1              trimming how many bases in
#                               tail for read1, default is 0
#                               (int [=0])
#
# -b, --max_len1                if read1 is longer than
#                               max_len1, then trim read1 at
#                               its tail to make it as long as
#                               max_len1. Default 0 means no
#                               limitation (int [=0])
#
# -F, --trim_front2             trimming how many bases in
#                               front for read2. If it's not
#                               specified, it will follow
#                               read1's settings (int [=0])
#
# -T, --trim_tail2              trimming how many bases in
#                               tail for read2. If it's not
#                               specified, it will follow
#                               read1's settings (int [=0])
#
# -B, --max_len2                if read2 is longer than
#                               max_len2, then trim read2 at
#                               its tail to make it as long as
#                               max_len2. Default 0 means no
#                               limitation. If it's not
#                               specified, it will follow
#                               read1's settings (int [=0])
#
# -g, --trim_poly_g             force polyG tail trimming; by
#                               default trimming is
#                               automatically enabled for
#                               Illumina NextSeq/NovaSeq data
#
# --poly_g_min_len              the minimum length to detect
#                               polyG in the read tail. 10 by
#                               default. (int [=10])
#
# -G, --disable_trim_poly_g     disable polyG tail trimming;
#                               by default trimming is
#                               automatically enabled for
#                               Illumina NextSeq/NovaSeq data
#
# -x, --trim_poly_x             enable polyX trimming in 3'
#                               ends.
#
# --poly_x_min_len              the minimum length to detect
#                               polyX in the read tail. 10 by
#                               default. (int [=10])
#
# -5, --cut_front               move a sliding window from
#                               front (5') to tail, drop the
#                               bases in the window if its
#                               mean quality < threshold,
#                               stop otherwise.
#
# -3, --cut_tail                move a sliding window from
#                               tail (3') to front, drop the
#                               bases in the window if its
#                               mean quality < threshold,
#                               stop otherwise.
#
# -r, --cut_right               move a sliding window from
#                               front to tail, if meet one
#                               window with mean quality
#                               < threshold, drop the bases in
#                               the window and the right part,
#                               and then stop.
#
# -W, --cut_window_size         the window size option shared
#                               by cut_front, cut_tail or
#                               cut_sliding. Range: 1~1000,
#                               default: 4 (int [=4])
#
# -M, --cut_mean_quality        the mean quality requirement
#                               option shared by cut_front,
#                               cut_tail or cut_sliding.
#                               Range: 1~36, default: 20 (Q20)
#                               (int [=20])
#
# --cut_front_window_size       the window size option of
#                               cut_front, default to
#                               cut_window_size if not
#                               specified (int [=4])
#
# --cut_front_mean_quality      the mean quality requirement
#                               option for cut_front, default
#                               to cut_mean_quality if not
#                               specified (int [=20])
#
# --cut_tail_window_size        the window size option of
#                               cut_tail, default to
#                               cut_window_size if not
#                               specified (int [=4])
#
# --cut_tail_mean_quality       the mean quality requirement
#                               option for cut_tail, default
#                               to cut_mean_quality if not
#                               specified (int [=20])
#
# --cut_right_window_size       the window size option of
#                               cut_right, default to
#                               cut_window_size if not
#                               specified (int [=4])
#
# --cut_right_mean_quality      the mean quality requirement
#                               option for cut_right, default
#                               to cut_mean_quality if not
#                               specified (int [=20])
#
# -Q, --disable_quality_filtering  quality filtering is
#                               enabled by default. If this
#                               option is specified, quality
#                               filtering is disabled
#
# -q, --qualified_quality_phred the quality value that a base
#                               is qualified. Default 15 means
#                               phred quality >=Q15 is
#                               qualified. (int [=15])
#
# -u, --unqualified_percent_limit  how many percents of bases
#                               are allowed to be unqualified
#                               (0~100). Default 40 means 40%
#                               (int [=40])
#
# -n, --n_base_limit            if one read's number of N base
#                               is >n_base_limit, then this
#                               read/pair is discarded.
#                               Default is 5 (int [=5])
#
# -L, --disable_length_filtering  length filtering is enabled
#                               by default. If this option is
#                               specified, length filtering is
#                               disabled
#
# -l, --length_required         reads shorter than
#                               length_required will be
#                               discarded, default is 15.
#                               (int [=15])
#
# --length_limit                reads longer than length_limit
#                               will be discarded, default 0
#                               means no limitation. (int [=0])
#
# -y, --low_complexity_filter   enable low complexity filter.
#                               The complexity is defined as
#                               the percentage of base that is
#                               different from its next base
#                               (base[i] != base[i+1]).
#
# -Y, --complexity_threshold    the threshold for low
#                               complexity filter (0~100).
#                               Default is 30, which means 30%
#                               complexity is required.
#                               (int [=30])
#
# --filter_by_index1            specify a file contains a list
#                               of barcodes of index1 to be
#                               filtered out, one barcode per
#                               line (string [=])
#
# --filter_by_index2            specify a file contains a list
#                               of barcodes of index2 to be
#                               filtered out, one barcode per
#                               line (string [=])
#
# --filter_by_index_threshold   the allowed difference of
#                               index barcode for index
#                               filtering, default 0 means
#                               completely identical.
#                               (int [=0])
#
# -c, --correction              enable base correction in
#                               overlapped regions (only for
#                               PE data), default is disabled
#
# --overlap_len_require         the minimum length of the
#                               overlapped region for overlap
#                               analysis based adapter
#                               trimming and correction. 30 by
#                               default. (int [=30])
#
# --overlap_diff_limit          the maximum difference of the
#                               overlapped region for overlap
#                               analysis based adapter
#                               trimming and correction. 5 by
#                               default. (int [=5])
#
# -U, --umi                     enable unique molecular
#                               identifier (UMI) preprocessing
#
# --umi_loc                     specify the location of UMI,
#                               can be index1/index2/read1/
#                               read2/per_index/per_read,
#                               default is none (string [=])
#
# --umi_len                     if the UMI is in read1/read2,
#                               its length should be provided
#                               (int [=0])
#
# --umi_prefix                  if specified, an underline
#                               will be used to connect prefix
#                               and UMI (i.e. prefix=UMI,
#                               UMI=AATTCG, final=UMI_AATTCG).
#                               No prefix by default
#                               (string [=])
#
# --umi_skip                    if the UMI is in read1/read2,
#                               fastp can skip several bases
#                               following UMI, default is 0
#                               (int [=0])
#
# -p, --overrepresentation_analysis  enable overrepresented
#                               sequence analysis.
#
# -P, --overrepresentation_sampling  one in
#                               (--overrepresentation_sampling)
#                               reads will be computed for
#                               overrepresentation analysis
#                               (1~10000), smaller is slower,
#                               default is 20. (int [=20])
#
# -----------------------------------------------------------
# END OF VIABLE PARAMETERS.
# -----------------------------------------------------------
#
# Leave params_FastP unset, if you are not familiar with FastP
# parameters. In this case FastP will perform quality scanning
# according to its generic settings, which may or may not be
# a good starting point.
#
# Note 1: often, adapters (see above) do not need to be
#    trimmed by FastP, since sequencers do it by themselves.
#
# Note 2: the default value of parameter
#    --qualified_quality_phred is 15, which is generally too
#    low. Please consider increasing it to 30, maybe 35, or
#    even higher.
#params_FastP = "--qualified_quality_phred 40"
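# A slightly richer (hypothetical) sketch, combining several
# of the options listed above -- sliding-window trimming from
# the 3' end plus a stricter per-base quality threshold:
#
#    params_FastP = "--cut_right --cut_right_window_size 4 --cut_right_mean_quality 20 --qualified_quality_phred 30"
#
# Treat this only as a starting point, and adjust the
# thresholds to your data.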
#************************************************************
# Parameters for the Idba_UD Assembler
#
# https://i.cs.hku.hk/~alse/hkubrg/projects/idba_ud/
#
# This section is relevant only if an input scaffold file
# is not specified with the input_scaffold_file_x parameters
#************************************************************
[Idba_UD]
# OPTIONAL: include_Idba_UD (default NO)
#
# Set to Yes to run Idba_UD scaffold assembling and
# include its results in further analysis.
#
# When several scaffolds are available (for example due to
# specification of the previous parameter
# input_scaffold_file_x together with enabling at least one
# assembler, or by enabling several assemblers), then
# Orchestra executes a Cartesian product of analyses with all
# available scaffolds and other multi-choice selections, as
# described further on.
#include_Idba_UD = yes
# OPTIONAL: params_Idba_UD (quote delimited)
#
# Specify any algorithmic parameters that Idba_UD knows about.
#
# Example: params_Idba_UD = "--seed_kmer=35"
#
# NOTE 1: this option is available for experienced users.
#    Leave params_Idba_UD unset, if you do not have a
#    specific reason to do otherwise, or if you are
#    not familiar with Idba_UD parameters.
#
# NOTE 2: you should NOT specify any input and output files,
#    since Orchestra handles these; only parameters
#    that affect calculations are feasible here.
#
# NOTE 3: the value of this parameter is passed directly to
#    Idba_UD without any checking or formatting. It is the
#    responsibility of the user to assure the correctness of
#    these parameters.
#
# NOTE 4: each parameter that is not set in this
#    manner is set to its default value by Idba_UD itself.
#    Please consult the Idba_UD documentation for details.
#
# -----------------------------------------------------------
# VIABLE PARAMETERS ARE THE FOLLOWING.
# Descriptions are copied verbatim
# (or slightly modified) from Idba_UD help.
# If you have questions regarding these parameters,
# please consult the documentation or authors of Idba_UD.
# -----------------------------------------------------------
#
# --mink arg (=20)        minimum k value (<=124)
# --maxk arg (=100)       maximum k value (<=124)
# --step arg (=20)        increment of k-mer of each iteration
# --inner_mink arg (=10)  inner minimum k value
# --inner_step arg (=5)   inner increment of k-mer
# --prefix arg (=3)       prefix length used to build sub
#                         k-mer table
# --min_count arg (=2)    minimum multiplicity for filtering
#                         k-mer when building the graph
# --min_support arg (=1)  minimum support in each iteration
# --seed_kmer arg (=30)   seed kmer size for alignment
# --min_contig arg (=200) minimum size of contig
# --similar arg (=0.95)   similarity for alignment
# --max_mismatch arg (=3) max mismatch of error correction
# --min_pairs arg (=3)    minimum number of pairs
# --no_bubble             do not merge bubble
# --no_local              do not use local assembly
# --no_coverage           do not iterate on coverage
# --no_correct            do not do correction
# --pre_correction        perform pre-correction before
#                         assembly
#
# -----------------------------------------------------------
# END OF VIABLE PARAMETERS.
# -----------------------------------------------------------
#
# Leave params_Idba_UD unset, if you do not have a specific
# reason to do otherwise, or if you are not familiar
# with Idba_UD parameters.
#params_Idba_UD = "--seed_kmer=35"
# OPTIONAL: Idba_UD_variant
#
# Orchestra provides four modifications of the original
# Idba_UD algorithm. These are offered due to
# discovered or reported issues with the program's usage.
#
# Possible values are:
#
# original: the unmodified Idba_UD, as distributed by its
#    authors.
#
# maxShort512 (default): this variant increases the permitted
#    length of short sequences from 128bp to 512bp. This limit
#    is believed to be merely a programming quirk, which does
#    not affect the generation of a scaffold (but this is not
#    verified). Idba_UD sometimes crashes without this
#    modification.
#
# maxShort512_noMergeSimilarPaths: in addition to the change
#    provided with "maxShort512", this variant also disables
#    the function contig_graph.MergeSimilarPath(). According
#    to some reports, this function has a bug and/or consumes
#    a lot of RAM, which leads to program crashes. You should
#    try this variant, if Idba_UD runs out of memory when
#    running the variants "original" or "maxShort512".
#
# maxShort512_kmer16: in addition to the change provided with
#    "maxShort512" (but not "maxShort512_noMergeSimilarPaths"),
#    this variant also increases the (presumed) max kmer
#    length from 4 to 16. The change is introduced because one
#    of the Idba_UD users reported a more satisfactory
#    operation with this modification.
#
# maxShort512_kmer16_noMergeSimilarPaths: in addition to both
#    changes provided with "maxShort512_kmer16", this variant
#    also disables the function
#    contig_graph.MergeSimilarPath(). You should try this
#    variant, if Idba_UD runs out of memory when running the
#    variant "maxShort512_kmer16".
#
# Leave this parameter unset, if you do not have a
# specific reason to do otherwise.
#Idba_UD_variant = original
#Idba_UD_variant = maxShort512
#Idba_UD_variant = maxShort512_noMergeSimilarPaths
#Idba_UD_variant = maxShort512_kmer16
#Idba_UD_variant = maxShort512_kmer16_noMergeSimilarPaths
#************************************************************
# Parameters for the MegaHIT Assembler
#
# https://github.com/voutcn/megahit
#
# This section is relevant only if an input scaffold file
# is not specified with the input_scaffold_file_x parameters
#************************************************************
[MegaHIT]
# OPTIONAL: include_MegaHIT (default NO)
#
# Set to Yes to run MegaHIT scaffold assembling
# and include its results in further analysis.
#
# When several scaffolds are available (for example due to
# specification of the previous parameter
# input_scaffold_file_x together with enabling at least one
# assembler, or by enabling several assemblers), then
# Orchestra executes a Cartesian product of analyses with all
# available scaffolds and other multi-choice selections, as
# described further on.
#include_MegaHIT = yes
# OPTIONAL: params_MegaHIT (quote delimited)
#
# Specify any algorithmic parameters that MegaHIT knows about.
#
# Example: params_MegaHIT = "..."
#
# NOTE 1: this option is available for experienced users.
#    Leave params_MegaHIT unset, if you do not have a
#    specific reason to do otherwise, or if you are
#    not familiar with MegaHIT parameters.
#
# NOTE 2: you should NOT specify any input and output files,
#    since Orchestra handles these; only parameters
#    that affect calculations are feasible here.
#
# NOTE 3: the value of this parameter is passed directly to
#    MegaHIT without any checking or formatting. It is the
#    responsibility of the user to assure the correctness of
#    these parameters.
#
# NOTE 4: each parameter that is not set in this
#    manner is set to its default value by MegaHIT itself.
#    Please consult the MegaHIT documentation for details.
#
# -----------------------------------------------------------
# VIABLE PARAMETERS ARE THE FOLLOWING.
# Descriptions are copied verbatim
# (or slightly modified) from MegaHIT help.
# If you have questions regarding these parameters,
# please consult the documentation or authors of MegaHIT.
# -----------------------------------------------------------
#
# Basic assembly options:
#   --min-count        minimum multiplicity for filtering
#                      (k_min+1)-mers [2]
#   --k-list           comma-separated list of kmer sizes
#                      (all must be odd, in the range 15-255,
#                      increment <= 28) [21,29,39,59,79,99,119,141]
#
# Another way to set --k-list (overrides --k-list if one of
# them is set):
#   --k-min            minimum kmer size (<= 255), must be
#                      odd number [21]
#   --k-max            maximum kmer size (<= 255), must be
#                      odd number [141]
#   --k-step           increment of kmer size of each
#                      iteration (<= 28), must be even
#                      number [12]
#
# Advanced assembly options:
#   --no-mercy         do not add mercy kmers
#   --bubble-level     intensity of bubble merging (0-2),
#                      0 to disable [2]
#   --merge-level      merge complex bubbles of length
#                      <= l*kmer_size and similarity >= s
#                      [20,0.95]
#   --prune-level      strength of low depth pruning (0-3) [2]
#   --prune-depth      remove unitigs with avg kmer depth
#                      less than this value [2]
#   --low-local-ratio  ratio threshold to define low local
#                      coverage contigs [0.2]
#   --max-tip-len      remove tips less than this value [2*k]
#   --no-local         disable local assembly
#   --kmin-1pass       use 1pass mode to build SdBG of k_min
#
# Presets parameters:
#   --presets          override a group of parameters;
#                      possible values:
#                      meta-sensitive: '--min-count 1
#                      --k-list 21,29,39,49,...,129,141'
#                      meta-large: '--k-min 27 --k-max 127
#                      --k-step 10' (large & complex
#                      metagenomes, like soil)
#
# Hardware options:
#   -m/--memory        max memory in byte to be used in SdBG
#                      construction (if set between 0-1,
#                      fraction of the machine's total
#                      memory) [0.9]
#   --mem-flag         SdBG builder memory mode
#                      0: minimum; 1: moderate; others: use
#                      all memory specified by '-m/--memory'
#                      [1]
#   --use-gpu          use GPU
#   --gpu-mem          GPU memory in byte to be used.
#                      Default: auto detect to use up all
#                      free GPU memory.
#
# Output options:
#   --min-contig-len   minimum length of contigs to
#                      output [200]
#   --keep-tmp-files   keep all temporary files
#
# -----------------------------------------------------------
# END OF VIABLE PARAMETERS.
# -----------------------------------------------------------
#
# Leave params_MegaHIT unset, if you do not have a specific
# reason to do otherwise, or if you are not familiar
# with MegaHIT parameters.
#params_MegaHIT = "..."
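# As one concrete (hypothetical) illustration based on the
# presets listed above, a large and complex metagenome (e.g.
# soil) might be assembled with:
#
#    params_MegaHIT = "--presets meta-large"
#
# This is only a sketch; please verify it against the MegaHIT
# documentation before relying on it.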
#************************************************************
# Parameters for the MetaSPAdes Assembler
#
# http://cab.spbu.ru/software/spades/
#
# This section is relevant only if an input scaffold file
# is not specified with the input_scaffold_file_x parameters
#************************************************************
[MetaSPAdes]
# OPTIONAL: include_MetaSPAdes (default NO)
#
# Set to Yes to run MetaSPAdes scaffold assembling
# and include its results in further analysis.
#
# When several scaffolds are available (for example due to
# specification of the previous parameter
# input_scaffold_file_x together with enabling at least one
# assembler, or by enabling several assemblers), then
# Orchestra executes a Cartesian product of analyses with all
# available scaffolds and other multi-choice selections, as
# described further on.
#include_MetaSPAdes = yes
# OPTIONAL: params_MetaSPAdes (quote delimited)
#
# Specify any algorithmic parameters that MetaSPAdes knows
# about.
#
# Example: params_MetaSPAdes = "--rna"
#
# NOTE 1: this option is available for experienced users.
#    Leave params_MetaSPAdes unset, if you do not have a
#    specific reason to do otherwise, or if you are
#    not familiar with MetaSPAdes parameters.
#
# NOTE 2: you should NOT specify any input and output files,
#    since Orchestra handles these; only parameters
#    that affect calculations are feasible here.
#
# NOTE 3: the value of this parameter is passed directly to
#    MetaSPAdes without any checking or formatting. It is the
#    responsibility of the user to assure the correctness of
#    these parameters.
#
# NOTE 4: each parameter that is not set in this
#    manner is set to its default value by MetaSPAdes itself.
#    Please consult the MetaSPAdes documentation for details.
#
# NOTE 5: according to the MetaSPAdes instructions, it is
#    recommended to set the parameter/flag "--meta" for
#    metagenomic assembling. Other supported options are:
#       --rna: RNA-Seq data sets
#       --sc: single-cell data sets
#       --plasmid: plasmids from WGS data sets
#
# -----------------------------------------------------------
# VIABLE PARAMETERS ARE THE FOLLOWING.
# Descriptions are copied verbatim
# (or slightly modified) from MetaSPAdes help.
# If you have questions regarding these parameters,
# please consult the documentation or authors of MetaSPAdes.
# -----------------------------------------------------------
#
# Basic options:
#   --sc          this flag is required for MDA (single-cell)
#                 data
#   --meta        this flag is required for metagenomic sample
#                 data
#   --rna         this flag is required for RNA-Seq data
#   --plasmid     runs plasmidSPAdes pipeline for plasmid
#                 detection
#   --iontorrent  this flag is required for IonTorrent data
#
# Pipeline options:
#   --careful     tries to reduce number of mismatches and
#                 short indels
#   --disable-rr  disables repeat resolution stage of
#                 assembling
#
# Advanced options:
#   -m/--memory   RAM limit for SPAdes in Gb
#                 (terminates if exceeded) [default: 250]
#   -k            comma-separated list of k-mer sizes
#                 (must be odd and less than 128)
#                 [default: 'auto']
#   --cov-cutoff  coverage cutoff value (a positive float
#                 number, or 'auto', or 'off')
#                 [default: 'off']
#   --phred-offset <33 or 64>  PHRED quality offset in the
#                 input reads (33 or 64)
#                 [default: auto-detect]
#
# -----------------------------------------------------------
# END OF VIABLE PARAMETERS.
# -----------------------------------------------------------
#
# Leave params_MetaSPAdes unset, if you do not have a specific
# reason to do otherwise, or if you are not familiar
# with MetaSPAdes parameters.
#params_MetaSPAdes = "--meta"
#************************************************************
# Parameters for SAM, BAM and abundance file generation
#
# http://bio-bwa.sourceforge.net/
# http://bowtie-bio.sourceforge.net/bowtie2/
# https://sourceforge.net/projects/bbmap/
# http://www.htslib.org/
#************************************************************
[SAM_BAM]
# NOTE: conversion of input files to SAM and BAM format,
# as well as generation of abundance files, is done
# only if other enabled software (as set below)
# requires these files as its input.
# There are no parameters for explicitly enabling the
# generation of these files, since Orchestra handles
# these decisions automatically.
#============================================================
# Parameters for SAM file generation
#============================================================
# OPTIONAL: include_SAM_generation_Bwa     (default no)
#           include_SAM_generation_Bowtie2 (default no)
#           include_SAM_generation_BBMap   (default no)
#
# Enumeration of the desired SAM file generation methods.
# Possible choices are Bwa, Bowtie2 and BBMap,
# which refer to the names of the respective programs
# for SAM file generation (internet links above).
#
# It is possible to select several methods. Orchestra
# generates SAM files according to a Cartesian product
# of all above specified scaffolds (enabled assemblers
# and potential external scaffold file) and enabled
# SAM generation methods. For example, if two assemblers
# and all three SAM generation methods are enabled,
# then Orchestra generates six different SAM files,
# and, consequently, runs each enabled binner
# six times as well.
#
# Generally, only one SAM-generation method is supposed
# to be enabled, unless the intention is to compare
# relative performance of these methods.
#
# If no SAM generation method is enabled, then Orchestra
# automatically enables method Bwa to preserve
# continuity with Orchestra versions V1.x, where this
# was the only available method.

#include_SAM_generation_Bwa = Yes
#include_SAM_generation_Bowtie2 = Yes
#include_SAM_generation_BBMap = Yes

#************************************************************
# relevant only if Bwa SAM file generation is enabled
#************************************************************
#
# Generation of SAM file with Bwa is a two-step process.
# First, the scaffold is indexed, and then input sequences
# are aligned to the scaffold. This two-step process is
# reflected in the choice of parameters below for
# fine-tuning each of the two steps.
#
#
#
# OPTIONAL: params_bwa_index (quote delimited)
#
# Specify any algorithmic parameters that Bwa Index
# command knows about.
#
# Example: params_bwa_index = "-a bwtsw"
#
# NOTE 1: this option is available for experienced users.
#         Leave params_bwa_index unset, if you do not have a
#         specific reason to do otherwise, or if you are
#         not familiar with Bwa index parameters.
#
# NOTE 2: you should NOT specify any input and output files,
#         since Orchestra handles these; only parameters
#         that affect calculations are feasible here.
#
# NOTE 3: do not specify option '-6' or '-p' since these change
#         output file names. This would crash Orchestra processing.
#
# NOTE 4: the value of each parameter that is not set in this
#         manner is set to its default value by Bwa itself.
#         Please consult Bwa documentation for details.
#
# -----------------------------------------------------------
# VIABLE PARAMETERS ARE THE FOLLOWING.
# Descriptions are copied verbatim
# (or slightly modified) from Bwa help.
# If you have questions regarding these parameters,
# please consult documentation or authors of Bwa.
# -----------------------------------------------------------
#
#   -a STR   BWT construction algorithm: bwtsw, is or rb2 [auto]
#   -b INT   block size for the bwtsw algorithm
#            (effective with -a bwtsw) [10000000]
#
#   Warning: `-a bwtsw' does not work for short genomes,
#            while `-a is' and `-a div' do not work
#            for long genomes.
#
# -----------------------------------------------------------
# END OF VIABLE PARAMETERS.
# -----------------------------------------------------------
#
# Leave params_bwa_index unset, if you do not have a specific
# reason to do otherwise, or if you are not familiar
# with Bwa index parameters.

#params_bwa_index = "-a bwtsw"

# OPTIONAL: bwa_alignment_algorithm
#
# Specify Bwa alignment algorithm. Possible values are:
# mem (default), bwasw, aln.
#
# Example: bwa_alignment_algorithm = mem
#
# This option is available for experienced users.
# Leave bwa_alignment_algorithm unset, if you do not
# have a specific reason to do otherwise, or if you
# are not familiar with Bwa alignment algorithms.
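#
# Purely as an illustrative sketch (not a recommendation): one
# could select the default mem algorithm and tighten its seeding
# via the params_bwa_mem parameter described below; -k (minimum
# seed length) and -T (minimum score to output an alignment) are
# standard bwa mem options:
#
#   bwa_alignment_algorithm = mem
#   params_bwa_mem = "-k 25 -T 40"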
#bwa_alignment_algorithm = mem

# OPTIONAL: params_bwa_mem   (quote delimited)
#           params_bwa_bwasw (quote delimited)
#           params_bwa_aln   (quote delimited)
#
# Specify any algorithmic parameters that the respective
# Bwa alignment algorithms know about.
#
# NOTE 1: this option is available for experienced users.
#         Leave these parameters unset, if you do not have a
#         specific reason to do otherwise, or if you are
#         not familiar with Bwa alignment parameters.
#
# NOTE 2: you should NOT specify any input and output files,
#         since Orchestra handles these; only parameters
#         that affect calculations are feasible here.
#
# NOTE 3: do not specify any options that modify file names.
#         This would crash Orchestra processing.
#
# NOTE 4: do not specify number of threads here.
#         Orchestra does this by itself.
#
# NOTE 5: the value of each parameter that is not set in this
#         manner is set to its default value by Bwa itself.
#         Please consult Bwa documentation for details.
#
# Leave these parameters unset, if you do not have a
# specific reason to do otherwise, or if you are not
# familiar with Bwa alignment algorithms.

#params_bwa_mem = ""
#params_bwa_bwasw = ""
#params_bwa_aln = ""

#************************************************************
# relevant only if Bowtie2 SAM file generation is enabled
#************************************************************
#
# Generation of SAM file with Bowtie2 is a two-step process.
# First, the scaffold is indexed, and then input sequences
# are aligned to the scaffold. This two-step process is
# reflected in the choice of parameters below for
# fine-tuning each of the two steps.
#
#
#
# OPTIONAL: params_bowtie2_index (quote delimited)
#
# Specify any algorithmic parameters that bowtie2-build
# command for building index knows about.
#
# Example: params_bowtie2_index = "--noauto"
#
# NOTE 1: this option is available for experienced users.
#         Leave params_bowtie2_index unset, if you do not
#         have a specific reason to do otherwise, or if
#         you are not familiar with bowtie2-build parameters.
#
# NOTE 2: you should NOT specify any input and output files,
#         since Orchestra handles these; only parameters
#         that affect calculations are feasible here.
#
# NOTE 3: the value of each parameter that is not set in
#         this manner is set to its default value by
#         bowtie2-build itself. Please consult Bowtie2
#         documentation for details.
#
# -----------------------------------------------------------
# VIABLE PARAMETERS ARE THE FOLLOWING.
# Descriptions are copied verbatim
# (or slightly modified) from Bowtie2 help.
# If you have questions regarding these parameters,
# please consult documentation or authors of Bowtie2.
# -----------------------------------------------------------
#
#   --large-index    force generated index to be 'large', even if ref
#                    has fewer than 4 billion nucleotides
#
#   --verbose        log the issued command
#
#   -a/--noauto      disable automatic -p/--bmax/--dcv memory-fitting
#
#   -p/--packed      use packed strings internally; slower, less memory
#
#   -q/--quiet       disable verbose output (for debugging)
#
#   --bmax           max bucket sz for blockwise suffix-array builder
#
#   --bmaxdivn       max bucket sz as divisor of ref len (default: 4)
#
#   --dcv            diff-cover period for blockwise (default: 1024)
#
#   --nodc           disable diff-cover (algorithm becomes quadratic)
#
#   -o/--offrate     SA is sampled every 2^<int> BWT chars (default: 5)
#
#   -t/--ftabchars   # of chars consumed in initial lookup (default: 10)
#
#   --seed           seed for random number generator
#
# -----------------------------------------------------------
# END OF VIABLE PARAMETERS.
# -----------------------------------------------------------
#
# Leave params_bowtie2_index unset, if you do not have a
# specific reason to do otherwise, or if you are not
# familiar with bowtie2-build parameters.

#params_bowtie2_index = "--noauto"

# OPTIONAL: params_bowtie2_alignment (quote delimited)
#
# Specify any algorithmic parameters that bowtie2
# command for sequence alignment knows about.
#
# Example: params_bowtie2_alignment = "--very-sensitive"
#
# NOTE 1: this option is available for experienced users.
#         Leave params_bowtie2_alignment unset, if you do not
#         have a specific reason to do otherwise, or if
#         you are not familiar with bowtie2 parameters.
#
# NOTE 2: you should NOT specify any input and output files,
#         since Orchestra handles these; only parameters
#         that affect calculations are feasible here.
#
# NOTE 3: the value of each parameter that is not set in
#         this manner is set to its default value by
#         bowtie2 itself. Please consult Bowtie2
#         documentation for details.
#
# -----------------------------------------------------------
# There are very many viable bowtie2 parameters,
# and for that reason they are not listed here.
# Please consult Bowtie2 documentation.
# -----------------------------------------------------------
#
# Leave params_bowtie2_alignment unset, if you do not have a
# specific reason to do otherwise, or if you are not
# familiar with bowtie2 parameters.

#params_bowtie2_alignment = "--very-sensitive"

#************************************************************
# relevant only if BBMap SAM file generation is enabled
#************************************************************
#
# Generation of SAM file with BBMap is a two-step process.
# First, the scaffold is indexed, and then input sequences
# are aligned to the scaffold. This two-step process is
# reflected in the choice of parameters below for
# fine-tuning each of the two steps.
#
#
#
# OPTIONAL: params_bbmap_index (quote delimited)
#
# Specify any algorithmic parameters that BBMap
# command for building index knows about.
#
# Example: params_bbmap_index = "k=13"
#
# NOTE 1: this option is available for experienced users.
#         Leave params_bbmap_index unset, if you do not
#         have a specific reason to do otherwise, or if
#         you are not familiar with BBMap parameters.
#
# NOTE 2: you should NOT specify any input and output files,
#         since Orchestra handles these; only parameters
#         that affect calculations are feasible here.
#
# NOTE 3: the value of each parameter that is not set in
#         this manner is set to its default value by
#         BBMap itself. Please consult BBMap
#         documentation for details.
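#
# Purely as an illustrative sketch (not a recommendation): BBMap
# builds its index with kmer length k=13 by default; a longer kmer
# is generally faster, while a shorter kmer tends to be more
# sensitive. For example:
#
#   params_bbmap_index = "k=12"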
#
# -----------------------------------------------------------
# There are very many viable BBMap parameters,
# and for that reason they are not listed here.
# Please consult BBMap documentation.
# -----------------------------------------------------------
#
# Leave params_bbmap_index unset, if you do not have a
# specific reason to do otherwise, or if you are not
# familiar with BBMap parameters.

#params_bbmap_index = "k=13"

# OPTIONAL: params_bbmap_alignment (quote delimited)
#
# Specify any algorithmic parameters that BBMap
# command for sequence alignment knows about.
#
# Example: params_bbmap_alignment = "tipsearch=100"
#
# NOTE 1: this option is available for experienced users.
#         Leave params_bbmap_alignment unset, if you do not
#         have a specific reason to do otherwise, or if
#         you are not familiar with BBMap parameters.
#
# NOTE 2: you should NOT specify any input and output files,
#         since Orchestra handles these; only parameters
#         that affect calculations are feasible here.
#
# NOTE 3: the value of each parameter that is not set in
#         this manner is set to its default value by
#         BBMap itself. Please consult BBMap
#         documentation for details.
#
# -----------------------------------------------------------
# There are very many viable BBMap parameters,
# and for that reason they are not listed here.
# Please consult BBMap documentation.
# -----------------------------------------------------------
#
# Leave params_bbmap_alignment unset, if you do not have a
# specific reason to do otherwise, or if you are not
# familiar with BBMap parameters.

#params_bbmap_alignment = "tipsearch=100"

#************************************************************
# parameters for converting resulting SAM files
# to a sorted BAM file format
#************************************************************

# OPTIONAL: params_samtools_raw_BAM (quote delimited)
#
# Specify any samtools-view algorithmic parameters for
# conversion of a SAM file to a raw (unsorted) BAM file.
#
# NOTE 1: parameters "-h -b -@" are already provided by
#         Orchestra and must not be repeated here.
#
# NOTE 2: you should NOT specify any input and output files,
#         since Orchestra handles these; only parameters
#         that affect calculations are feasible here.
#
# NOTE 3: do not specify any options that modify file names.
#         This would crash Orchestra processing.
#
# NOTE 4: do not specify number of threads here.
#         Orchestra does this by itself.
#
# NOTE 5: the value of each parameter that is not set in this
#         manner is set to its default value by Samtools itself.
#         Please consult Samtools documentation for details.
#
# Leave this parameter unset, if you do not have a
# specific reason to do otherwise, or if you are not
# familiar with Samtools parameters.
#
# However, you may consider the frequently used
# option "-F 4" for including only mapped
# sequences in resulting BAM files.
# This option was forced in Orchestra V1.x,
# whereas now it is under the control of
# the pipeline configurator.

#params_samtools_raw_BAM = "-F 4"

#************************************************************
# Parameters for execution of MaxBin
#
# https://sourceforge.net/projects/maxbin/
#************************************************************

[MaxBin]

# OPTIONAL: include_MaxBin (default NO)
#
# Set to Yes to run MaxBin and include its
# results in further analysis.
#include_MaxBin = yes

# OPTIONAL: MaxBin_own_abundance_file (default: Yes)
#
# Set to Yes to let MaxBin build its own abundance
# file, or to No to use the abundance file that is
# produced by the BBMap method (from pileup file).
# The first choice takes longer to compute, but it
# makes MaxBin results more reproducible.

#MaxBin_own_abundance_file = No

# OPTIONAL: params_MaxBin (quote delimited)
#
# Specify any algorithmic parameters that MaxBin knows about.
#
# Example: params_MaxBin = "-prob_threshold 0.7 -markerset 40"
#
# NOTE 1: you should NOT specify any input and output files,
#         since Orchestra handles these; only parameters
#         that affect calculations are feasible here.
#
# NOTE 2: the value of this parameter is passed directly to MaxBin
#         without any checking or formatting. It is the responsibility
#         of a user to assure the correctness of these parameters.
#
# NOTE 3: this setting also indirectly affects other binners,
#         which rely on MaxBin results for their processing.
#
# NOTE 4: the value of each parameter that is not set in this
#         manner is set to its default value by MaxBin itself.
#         Please consult MaxBin documentation for details.
#
# -----------------------------------------------------------
# VIABLE PARAMETERS ARE THE FOLLOWING.
# Descriptions are copied verbatim
# (or slightly modified) from MaxBin help.
# If you have questions regarding these parameters,
# please consult documentation or authors of MaxBin.
# -----------------------------------------------------------
#
#   -min_contig_length  minimum contig length. Default 1000
#   -max_iteration      maximum Expectation-Maximization algorithm
#                       iteration number. Default 50
#   -prob_threshold     probability threshold for EM final
#                       classification. Default 0.9
#   -plotmarker
#   -markerset          marker gene sets, 107 (default) or 40
#
#   for debug purpose:
#   -verbose
#   -preserve_intermediate
#
# -----------------------------------------------------------
# END OF VIABLE PARAMETERS.
# -----------------------------------------------------------
#
# Leave params_MaxBin unset, if you do not have a specific
# reason to do otherwise, or if you are not familiar
# with MaxBin parameters.

#params_MaxBin = "-prob_threshold 0.7 -markerset 40"

#************************************************************
# Parameters for execution of Concoct
#
# https://concoct.readthedocs.io/en/latest/
#************************************************************

[Concoct]

# OPTIONAL: include_Concoct (default NO)
#
# Set to Yes to run Concoct and include its
# results in further analysis.

#include_Concoct = yes

# OPTIONAL: Concoct_abundance_from_MaxBin (default: Yes)
#
# Set to Yes to let Concoct use the abundance file
# that was produced by MaxBin, or to No
# to use the abundance file that is produced by the
# BBMap method (from pileup file).
#
# NOTE: if MaxBin does not produce its own abundance
#       file (because MaxBin is not enabled or not
#       configured to do so), then Concoct automatically
#       uses the BBMap abundance file.

#Concoct_abundance_from_MaxBin = No

# OPTIONAL: params_Concoct (quote delimited)
# Specify any algorithmic parameters that Concoct knows about.
#
# Example: params_Concoct = "--clusters 300 --kmer_length 3"
#
# NOTE 1: you should NOT specify any input and output files,
#         since Orchestra handles these; only parameters
#         that affect calculations are feasible here.
#
# NOTE 2: the value of this parameter is passed directly to Concoct
#         without any checking or formatting. It is the responsibility
#         of a user to assure the correctness of these parameters.
#
# NOTE 3: the value of each parameter that is not set in this
#         manner is set to its default value by Concoct itself.
#         Please consult Concoct documentation for details.
#
# -----------------------------------------------------------
# VIABLE PARAMETERS ARE THE FOLLOWING.
# Descriptions are copied verbatim
# (or slightly modified) from Concoct help.
# If you have questions regarding these parameters,
# please consult documentation or authors of Concoct.
# -----------------------------------------------------------
#
#   -c CLUSTERS, --clusters CLUSTERS
#       specify maximal number of clusters
#       for VGMM, default 400.
#
#   -k KMER_LENGTH, --kmer_length KMER_LENGTH
#       specify kmer length, default 4.
#
#   -l LENGTH_THRESHOLD, --length_threshold LENGTH_THRESHOLD
#       specify the sequence length threshold, contigs shorter
#       than this value will not be included. Defaults to 1000.
#
#   -r READ_LENGTH, --read_length READ_LENGTH
#       specify read length for coverage, default 100
#
#   --total_percentage_pca TOTAL_PERCENTAGE_PCA
#       The percentage of variance explained by the principal
#       components for the combined data.
#
#   -s SEED, --seed SEED
#       Specify an integer to use as seed for clustering.
#       0 gives a random seed, 1 is the default seed and any
#       other positive integer can be used. Other values give
#       ArgumentTypeError.
#
#   -i ITERATIONS, --iterations ITERATIONS
#       Specify maximum number of iterations for the VBGMM.
#       Default value is 500
#
#   -e EPSILON, --epsilon EPSILON
#       Specify the epsilon for VBGMM. Default value is 1.0e-6
#
#   --no_cov_normalization
#       By default the coverage is normalized with regards to
#       samples, then normalized with regards of contigs and
#       finally log transformed. By setting this flag you skip
#       the normalization and only do log transform of the
#       coverage.
#
# -----------------------------------------------------------
# END OF VIABLE PARAMETERS.
# -----------------------------------------------------------
#
# Leave params_Concoct unset, if you do not have a specific
# reason to do otherwise, or if you are not familiar
# with Concoct parameters.

#params_Concoct = "-l 1000 --total_percentage_pca 60"
#params_Concoct = "--converge_out"

#************************************************************
# Parameters for execution of MetaBat2
#
# https://bitbucket.org/berkeleylab/metabat
#************************************************************

[MetaBat2]

# OPTIONAL: include_MetaBat2 (default NO)
# Set to Yes to include results of MetaBat2 in further analysis.
# MetaBat2 is run through its script runMetaBat2.sh.

#include_MetaBat2 = yes

# OPTIONAL: params_MetaBat2 (quote delimited)
# Specify any algorithmic parameters that MetaBat2 SCRIPT knows about.
#
# NOTE 1: you should NOT specify any input and output files,
#         since Orchestra handles these; only parameters
#         that affect calculations are feasible here.
#
# NOTE 2: the value of this parameter is passed directly to MetaBat2
#         script without any checking or formatting. It is the
#         responsibility of a user to assure the correctness of
#         these parameters.
#
# NOTE 3: the value of each parameter that is not set in this
#         manner is set to its default value by MetaBat2 itself.
#         Please consult MetaBat2 documentation for details.
#
# -----------------------------------------------------------
# VIABLE PARAMETERS ARE THE FOLLOWING.
# Descriptions are copied verbatim
# (or slightly modified) from Metabat2 help.
# If you have questions regarding these parameters,
# please consult documentation or authors of MetaBat2.
# -----------------------------------------------------------
#
#   -m [ --minContig ] arg (=2500)
#            Minimum size of a contig for binning (should be >=1500).
#
#   --maxP arg (=95)
#            Percentage of 'good' contigs considered for binning decided
#            by connection among contigs. The greater, the more sensitive.
#
#   --minS arg (=60)
#            Minimum score of an edge for binning (should be between 1 and 99).
#            The greater, the more specific.
#
#   --maxEdges arg (=200)
#            Maximum number of edges per node. The greater, the more sensitive.
#
#   --pTNF arg (=0)
#            TNF probability cutoff for building TNF graph.
#            Use it to skip the preparation step. (0: auto).
#
#   --noAdd  Turning off additional binning for lost or small contigs.
#
#   -x [ --minCV ] arg (=1)
#            Minimum mean coverage of a contig in each library for binning.
#
#   --minCVSum arg (=1)
#            Minimum total effective mean coverage of a contig
#            (sum of depth over minCV) for binning.
#
#   -s [ --minClsSize ] arg (=200000)  Minimum size of a bin as the output.
#
#   --seed arg (=0)  For exact reproducibility. (0: use random seed)
#
# -----------------------------------------------------------
# END OF VIABLE PARAMETERS.
# -----------------------------------------------------------
#
# Leave params_MetaBat2 unset, if you do not have a specific
# reason to do otherwise, or if you are not familiar
# with MetaBat2 parameters.

#params_MetaBat2 = "-m 1500 --minS 10 -s 200"

#************************************************************
# Parameters for execution of BinSanity
#
# https://github.com/edgraham/BinSanity
#************************************************************

[BinSanity]

# OPTIONAL: include_BinSanity_plain (default NO)
# Set to Yes to include results of the plain BinSanity binner
# (and not BinSanity workflow) in further analysis.

#include_BinSanity_plain = yes

# OPTIONAL: include_BinSanity_workflow (default NO)
# Set to Yes to include results of the BinSanity workflow
# (and not plain BinSanity binner) in further analysis.
#
# Both BinSanity methods may be enabled at the same time
# and are executed independently, when both are enabled.

#include_BinSanity_workflow = yes

# OPTIONAL: include_BinSanity_lc (default NO)
# Set to Yes to include results of the BinSanity lc
# in further analysis. This binning method consumes
# less memory, and can therefore process larger jobs
# on the same hardware configuration than the BinSanity
# plain and workflow incarnations.
#
# All three BinSanity methods may be enabled at the same time
# and are executed independently, when more than
# one of them is enabled.

#include_BinSanity_lc = yes

# OPTIONAL: BinSanity_get_ids_cutoff_size (integer)
# Specify cutoff size for the BinSanity get-ids script, which builds
# a list of valid contigs to be input to BinSanity binning. Contigs
# smaller than this value are going to be ignored.
#
# Leave this parameter unset, if you do not have a
# specific reason to do otherwise.

#BinSanity_get_ids_cutoff_size = 200

# OPTIONAL: BinSanity_profile_transform (quote delimited)
# Specify one of the BinSanity-profile transform options.
# Valid values are:
#   scale --> Scaled by multiplying by 100 and log transformed (recommended)
#   None  --> Raw Coverage Values
#   log   --> Log transform
#   X5    --> Multiplication by 5
#   X10   --> Multiplication by 10
#   X100  --> Multiplication by 100
#   SQR   --> Square root
#
# Leave this parameter unset, if you do not have a
# specific reason to do otherwise.
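#
# As a rough worked illustration (assuming a base-10 logarithm in
# BinSanity's implementation): under the 'scale' transform a raw
# per-contig coverage of 2.5 becomes log10(2.5 * 100) = log10(250),
# i.e. approximately 2.4, which compresses the dynamic range of
# coverage values before clustering.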
#BinSanity_profile_transform = "scale"

# OPTIONAL: params_BinSanity_plain (quote delimited)
# Specify any algorithmic parameters that plain BinSanity binner knows about.
#
# NOTE 1: you should NOT specify any input and output files,
#         since Orchestra handles these; only parameters
#         that affect calculations are feasible here.
#
# NOTE 2: the value of this parameter is passed directly to BinSanity
#         binner without any checking or formatting. It is the
#         responsibility of a user to assure the correctness of
#         these parameters.
#
# NOTE 3: the value of each parameter that is not set in this
#         manner is set to its default value by BinSanity itself.
#         Please consult BinSanity documentation for details.
#
# -----------------------------------------------------------
# VIABLE PARAMETERS ARE THE FOLLOWING.
# Descriptions are copied verbatim
# (or slightly modified) from BinSanity help.
# If you have questions regarding these parameters,
# please consult documentation or authors of BinSanity.
# -----------------------------------------------------------
#
#   -p PREFERENCE   Specify a preference (default is -3)
#                   Note: decreasing the preference leads to more lumping,
#                   increasing will lead to more splitting. If your range
#                   of coverages are low you will want to decrease the
#                   preference, if you have 10 or less replicates increasing
#                   the preference could benefit you.
#
#   -m MAXITER      Specify a max number of iterations [default is 2000]
#
#   -v CONVITER     Specify the convergence iteration number (default is 200)
#                   e.g. Number of iterations with no change in the number
#                   of estimated clusters that stops the convergence.
#
#   -d DAMP         Specify a damping factor between 0.5 and 1, default is 0.9
#
#   -x CONTIGSIZE   Specify the contig size cut-off [Default 1000 bp]
#
# -----------------------------------------------------------
# END OF VIABLE PARAMETERS.
# -----------------------------------------------------------
#
# Leave params_BinSanity_plain unset, if you do not have a specific
# reason to do otherwise, or if you are not familiar
# with BinSanity parameters.

#params_BinSanity_plain = "-p -6 -m 500 -v 100 -x 200"

# OPTIONAL: params_BinSanity_workflow (quote delimited)
# Specify any algorithmic parameters that BinSanity workflow knows about.
#
# NOTE 1: you should NOT specify any input and output files,
#         since Orchestra handles these; only parameters
#         that affect calculations are feasible here.
#
# NOTE 2: the value of this parameter is passed directly to BinSanity
#         workflow without any checking or formatting. It is the
#         responsibility of a user to assure the correctness of
#         these parameters.
#
# NOTE 3: the value of each parameter that is not set in this
#         manner is set to its default value by BinSanity itself.
#         Please consult BinSanity documentation for details.
#
# -----------------------------------------------------------
# VIABLE PARAMETERS ARE THE FOLLOWING.
# Descriptions are copied verbatim
# (or slightly modified) from BinSanity help.
# If you have questions regarding these parameters,
# please consult documentation or authors of BinSanity.
# -----------------------------------------------------------
#
#   -p PREFERENCE   Specify a preference (default is -3)
#                   Note: decreasing the preference leads to more lumping,
#                   increasing will lead to more splitting. If your range
#                   of coverages are low you will want to decrease the
#                   preference, if you have 10 or less replicates increasing
#                   the preference could benefit you.
#
#   -m MAXITER      Specify a max number of iterations [default is 4000]
#
#   -v CONVITER     Specify the convergence iteration number (default is 200)
#                   e.g. Number of iterations with no change in the number
#                   of estimated clusters that stops the convergence.
#
#   -d DAMP         Specify a damping factor between 0.5 and 1, default is 0.95
#
#   -x CONTIGSIZE   Specify the contig size cut-off [Default 1000 bp]
#
#   --kmer KMER     Indicate a number for the kmer calculation [Default: 4]
#
#   --refine-preference INPUTREFINEDPREF
#                   Specify a preference for refinement. [Default: -25]
#
# -----------------------------------------------------------
# END OF VIABLE PARAMETERS.
# -----------------------------------------------------------
#
# Leave params_BinSanity_workflow unset, if you do not have a specific
# reason to do otherwise, or if you are not familiar
# with BinSanity-wf parameters.

#params_BinSanity_workflow = "-p -6 -m 500 -v 100 -x 200"

# OPTIONAL: params_BinSanity_lc (quote delimited)
# Specify any algorithmic parameters that BinSanity lc knows about.
#
# NOTE 1: you should NOT specify any input and output files,
#         since Orchestra handles these; only parameters
#         that affect calculations are feasible here.
#
# NOTE 2: the value of this parameter is passed directly to
#         BinSanity lc without any checking or formatting.
#         It is the responsibility of a user to assure the
#         correctness of these parameters.
#
# NOTE 3: the value of each parameter that is not set in this
#         manner is set to its default value by BinSanity itself.
#         Please consult BinSanity documentation for details.
#
# -----------------------------------------------------------
# VIABLE PARAMETERS ARE THE FOLLOWING.
# Descriptions are copied verbatim
# (or slightly modified) from BinSanity help.
# If you have questions regarding these parameters,
# please consult documentation or authors of BinSanity.
# -----------------------------------------------------------
#
#   -p Preference             Specify a preference [Default: -3]
#                             Note: decreasing the preference leads to more
#                             lumping, increasing will lead to more splitting.
#                             If your range of coverages are low you will want
#                             to decrease the preference, if you have 10 or
#                             less replicates increasing the preference could
#                             benefit you.
#
#   -m MaximumIterations      Specify a max number of iterations [Default: 4000]
#
#   -v ConvergenceIterations  Specify the convergence iteration number
#                             [Default: 400]
#                             e.g. Number of iterations with no change in the
#                             number of estimated clusters that stops the
#                             convergence.
#
#   -d DampeningFactor        Specify a damping factor between 0.5 and 1
#                             [Default: 0.95]
#
#   -x SizeCutOff             Specify the contig size cut-off [Default: 1000 bp]
#
#   --kmer Kmer               Indicate a number for the kmer calculation
#                             [Default: 4]
#
#   --refine-preference       Specify a preference for refinement [Default: -25]
#
#   -C ClusterNumber          Indicate a number of initial clusters
#                             for kmean [Default: 100]
#
# -----------------------------------------------------------
# END OF VIABLE PARAMETERS.
# -----------------------------------------------------------
#
# Leave params_BinSanity_lc unset, if you do not have a specific
# reason to do otherwise, or if you are not familiar
# with BinSanity-lc parameters.

#params_BinSanity_lc = "-p -6 -m 500 -v 100 -x 200"

#************************************************************
# Parameters for execution of DasTool
#
# https://github.com/cmks/DAS_Tool
#************************************************************

[DasTool]

# OPTIONAL: include_DasTool (default NO)
# Set to Yes to run DasTool.
#
# Please note that at least one scaffold,
# binner and SAM generation method must
# be enabled to run DasTool.
#
# A separate DasTool analysis is performed
# for each available scaffold. An individual
# DasTool run takes into account results of
# all binning steps that are associated with
# a certain scaffold.
#
# If DasTool is not enabled, then bins that
# are produced by individual binners are
# directly fed to further steps of pipeline
# processing (CheckM, ezTree, Prokka-Roary,
# FastANI).
# If DasTool is enabled, then bins from
# individual binners are filtered and
# refined by DasTool, and only DasTool-
# generated bins are fed to further
# steps of analyses.

#include_DasTool = yes

# OPTIONAL: DasTool_Diamond_search_engine (default NO)
#
# Set to Yes to select Diamond as a DasTool search engine.
#
# Default selection is Blast. Engine usearch is not available.
#
# Leave this parameter unset, if you do not have a
# specific reason to do otherwise.

#DasTool_Diamond_search_engine = yes

# OPTIONAL: params_DasTool (quote delimited)
# Specify any algorithmic parameters that DasTool knows about.
#
# Example: params_DasTool = "--score_threshold 0.7"
#
# NOTE 1: you should NOT specify any input and output files,
#         since Orchestra handles these; only parameters
#         that affect calculations are feasible here.
#
# NOTE 2: the value of this parameter is passed directly to DasTool
#         without any checking or formatting. It is the responsibility
#         of a user to assure the correctness of these parameters.
#
# NOTE 3: the value of each parameter that is not set in this
#         manner is set to its default value by DasTool itself.
#         Please consult DasTool documentation for details.
#
# -----------------------------------------------------------
# VIABLE PARAMETERS ARE THE FOLLOWING.
# Descriptions are copied verbatim
# (or slightly modified) from DasTool help.
# If you have questions regarding these parameters,
# please consult documentation or authors of DasTool.
# -----------------------------------------------------------
#
#   --score_threshold    Score threshold until selection algorithm will
#                        keep selecting bins [0..1]. (default: 0.5)
#
#   --duplicate_penalty  Penalty for duplicate single copy genes per bin
#                        (weight b). Only change if you know what you're
#                        doing. [0..3] (default: 0.6)
#
#   --megabin_penalty    Penalty for megabins (weight c). Only change if
#                        you know what you're doing. [0..3] (default: 0.5)
#
# -----------------------------------------------------------
# END OF VIABLE PARAMETERS.
# -----------------------------------------------------------
#
# Leave params_DasTool unset, if you do not have a specific
# reason to do otherwise, or if you are not familiar
# with DasTool parameters.

#params_DasTool = "--score_threshold 0.7"

#************************************************************
# Parameters for execution of CheckM
# analysis of DasTool generated bins
#
# http://ecogenomics.github.io/CheckM/
#************************************************************

[CheckM]

# OPTIONAL: include_CheckM_lineage_wf (default NO)
# Set to Yes to run CheckM lineage workflow (lineage_wf).

#include_CheckM_lineage_wf = yes

# OPTIONAL: params_CheckM_lineage_wf (quote delimited)
# Specify any algorithmic parameters that CheckM lineage workflow knows about.
#
# Example: params_CheckM_lineage_wf = "--length 0.8 --aai_strain 0.85"
#
# NOTE 1: you should NOT specify any input and output files,
#         since Orchestra handles these; only parameters
#         that affect calculations are feasible here.
#
# NOTE 2: the value of this parameter is passed directly to CheckM
#         without any checking or formatting. It is the responsibility
#         of a user to assure the correctness of these parameters.
#
# NOTE 3: the value of each parameter that is not set in this
#         manner is set to its default value by CheckM itself.
#         Please consult CheckM documentation for details.
#
# -----------------------------------------------------------
# VIABLE PARAMETERS ARE THE FOLLOWING.
# Descriptions are copied verbatim
# (or slightly modified) from CheckM help.
# If you have questions regarding these parameters,
# please consult documentation or authors of CheckM.
# -----------------------------------------------------------
#
#   -u, --unique UNIQUE    minimum number of unique phylogenetic markers
#                          required to use lineage-specific marker set
#                          (default: 10)
#
#   -m, --multi MULTI      maximum number of multi-copy phylogenetic markers
#                          before defaulting to domain-level marker set
#                          (default: 10)
#
#   --force_domain         use domain-level sets for all bins
#
#   --no_refinement        do not perform lineage-specific marker set
#                          refinement
#
#   --individual_markers   treat marker as independent (i.e., ignore
#                          co-located set structure)
#
#   --skip_adj_correction  do not exclude adjacent marker genes when
#                          estimating contamination
#
#   --skip_pseudogene_correction
#                          skip identification and filtering of pseudogenes
#
#   --aai_strain AAI_STRAIN
#                          AAI threshold used to identify strain
#                          heterogeneity (default: 0.9)
#
#   --ignore_thresholds    ignore model-specific score thresholds
#
#   -e, --e_value E_VALUE  e-value cut off (default: 1e-10)
#
#   -l, --length LENGTH    percent overlap between target and query
#                          (default: 0.7)
#
# -----------------------------------------------------------
# END OF VIABLE PARAMETERS.
# -----------------------------------------------------------
#
# Leave params_CheckM_lineage_wf unset, if you do not have a specific
# reason to do otherwise, or if you are not familiar
# with CheckM parameters.

#params_CheckM_lineage_wf = "--length 0.8 --aai_strain 0.85"

# OPTIONAL: number_of_threads_CheckM_lineage_pplacer
# Specify number of threads for executing pplacer as a part of
# CheckM lineage workflow. Default is to use the same number of threads
# as for CheckM lineage workflow itself. However, memory demand of pplacer
# increases linearly with the number of its threads. Consequently,
# if Orchestra runs out of memory because pplacer exceeds
# memory capacity of the running hardware, then you can manually
# decrease its number of threads without unnecessarily lowering
# the number of threads of other Orchestra parts.

#number_of_threads_CheckM_lineage_pplacer = 4

# OPTIONAL: reduced_tree_CheckM_lineage (default NO)
# Set to Yes to use reduced tree (requires <16GB of memory) for determining
# lineage of each bin. Use this option if your computer has less than about
# 40 GB of RAM, or if CheckM_lineage runs out of memory.

#reduced_tree_CheckM_lineage = Yes

# OPTIONAL: collect_CheckM_filtered_bins (default NO)
# Set to Yes to collect bins that fulfill the below specified
# criterion into separate directory 61_CheckM_filtered_bins.
# This eases further downstream analysis of bins outside
# of the Orchestra pipeline.
#
# NOTE: CheckM lineage workflow needs to be enabled
#       to filter bins. Otherwise, this parameter is ignored.

#collect_CheckM_filtered_bins = Yes

# OPTIONAL: copy_CheckM_filtered_bins (default YES)
# This parameter is relevant only if
# collect_CheckM_filtered_bins is set to Yes.
# Set copy_CheckM_filtered_bins to Yes to make copies of filtered
# bins in directory 61_CheckM_filtered_bins. If this parameter
# is set to No, then symbolic links to bins are created, which
# makes it harder to export bins, but consumes less disk space.

#copy_CheckM_filtered_bins = No

# ALL FOUR OPTIONAL (numerical values between 0 and 100):
#   filter_CheckM_lineage_completeness_min
#   filter_CheckM_lineage_contamination_max
#
#   filter_CheckM_lineage_completeness_max
#   filter_CheckM_lineage_contamination_min
#
# Intervals of minimal and maximal completeness and contamination
# (as reported by CheckM lineage_wf) that collected bins
# must possess in order to be copied or linked to directory
# 61_CheckM_filtered_bins.
#
# Often, only the minimal value of completeness and the maximal
# value of contamination are prescribed, but specifying the
# other two as well (or instead of them) enables easier studying
# of less prominent bins, if such a need arises.
#
# Example 1: filter_CheckM_lineage_completeness_min = 90
#            filter_CheckM_lineage_contamination_max = 5
#            ...the other two parameters unspecified
#
# Example 2: filter_CheckM_lineage_completeness_min = 70
#            filter_CheckM_lineage_completeness_max = 90
#            filter_CheckM_lineage_contamination_max = 7
#            ...the fourth parameter unspecified
#
# Example 3: filter_CheckM_lineage_completeness_min = 70
#            filter_CheckM_lineage_completeness_max = 92
#            filter_CheckM_lineage_contamination_min = 4
#            filter_CheckM_lineage_contamination_max = 9
#
# If any of these parameters is disabled, it does not
# impose its respective restriction on collected bins.
#
# To collect all bins, disable all four limiting parameters.
#
# NOTE: the above recommendations are only
#       general and generic. There are situations,
#       where different settings also make sense.
#       If you are an expert, then you probably know
#       better how to set these parameters.

#filter_CheckM_lineage_completeness_min = 30
#filter_CheckM_lineage_contamination_max = 90
#filter_CheckM_lineage_completeness_max = 70
#filter_CheckM_lineage_contamination_min = 10

# OPTIONAL: filter_CheckM_FastANI_analysis (default NO)
# Set to Yes to run a special FastANI analysis on the
# resulting set of filtered bins.
# Unlike the separate FastANI step, described further on,
# no external references need to be provided here,
# since all resulting bins serve both as queries and as
# references for this multi-query multi-reference FastANI run.
#
# NOTE: at least two bins need to be collected by
#       the above filter for this step to run.

#filter_CheckM_FastANI_analysis = Yes

# OPTIONAL: params_CheckM_FastANI_analysis (quote delimited)
# Specify any algorithmic parameters that FastANI knows about.
#
# Example: params_CheckM_FastANI_analysis = "--fragLen 2000"
#
# NOTE 1: you should NOT specify any input and output files,
#         since Orchestra handles these; only parameters
#         that affect calculations are feasible here.
#
# NOTE 2: the value of this parameter is passed directly to FastANI
#         without any checking or formatting. It is the responsibility
#         of a user to assure the correctness of these parameters.
#
# NOTE 3: the value of each parameter that is not set in this
#         manner is set to its default value by FastANI itself.
#         Please consult FastANI documentation for details.
#
# -----------------------------------------------------------
# VIABLE PARAMETERS ARE THE FOLLOWING.
# Descriptions are copied verbatim
# (or slightly modified) from FastANI help.
# If you have questions regarding these parameters,
# please consult documentation or authors of FastANI.
# -----------------------------------------------------------
#
#   -k , --kmer  kmer size <= 16 [default : 16]
#
#   --fragLen    fragment length [default : 3,000]
#
#   --minFrag    minimum matched fragments for
#                trusting ANI [default : 50]
#
# -----------------------------------------------------------
# END OF VIABLE PARAMETERS.
# -----------------------------------------------------------
#
# Leave params_CheckM_FastANI_analysis unset, if you do not have
# a specific reason to do otherwise, or if you are not familiar
# with FastANI parameters.

#params_CheckM_FastANI_analysis = "--fragLen 500"

# OPTIONAL: include_CheckM_coverage_generation (default NO)
# Set to Yes to run the CheckM coverage command to produce
# a coverage file of collected bins.

#include_CheckM_coverage_generation = yes

# OPTIONAL: params_CheckM_coverage (quote delimited)
# Specify any algorithmic parameters that CheckM coverage knows about.
#
# Example: params_CheckM_coverage = "--min_align 0.95"
#
# NOTE 1: you should NOT specify any input and output files,
#         since Orchestra handles these; only parameters
#         that affect calculations are feasible here.
#
# NOTE 2: the value of this parameter is passed directly to CheckM
#         without any checking or formatting. It is the responsibility
#         of a user to assure the correctness of these parameters.
#
# NOTE 3: the value of each parameter that is not set in this
#         manner is set to its default value by CheckM itself.
#         Please consult CheckM documentation for details.
#
# -----------------------------------------------------------
# VIABLE PARAMETERS ARE THE FOLLOWING.
# Descriptions are copied verbatim
# (or slightly modified) from CheckM help.
# If you have questions regarding these parameters,
# please consult documentation or authors of CheckM.
# -----------------------------------------------------------
#
#   -a, --min_align MIN_ALIGN
#       minimum alignment length as percentage of read length (default: 0.98)
#
#   -e, --max_edit_dist MAX_EDIT_DIST
#       maximum edit distance as percentage of read length (default: 0.02)
#
#   -m, --min_qc MIN_QC
#       minimum quality score (in phred) (default: 15)
#
# -----------------------------------------------------------
# END OF VIABLE PARAMETERS.
# -----------------------------------------------------------
#
# Leave params_CheckM_coverage unset, if you do not have a specific
# reason to do otherwise, or if you are not familiar
# with CheckM parameters.

#params_CheckM_coverage = "--min_align 0.95"

# OPTIONAL: include_CheckM_taxonomy_wf (default NO)
# Set to Yes to run CheckM taxonomy workflow (taxonomy_wf).

#include_CheckM_taxonomy_wf = yes

# BOTH REQUIRED if CheckM taxonomy workflow is enabled:
#   CheckM_taxonomy_wf_taxonomic_rank
#   CheckM_taxonomy_wf_taxon
#
# Specifies taxonomic rank and taxon, respectively, of interest.
#
# Permissible values for taxonomic rank are:
# life, domain, phylum, class, order, family, genus or species.
#
# The set of plausible values for taxon changes with the
# selected taxonomic rank.
#
# Example 1: CheckM_taxonomy_wf_taxonomic_rank = Family
#            CheckM_taxonomy_wf_taxon = Proteobacteriaceae
#
# Example 2: CheckM_taxonomy_wf_taxonomic_rank = Genus
#            CheckM_taxonomy_wf_taxon = Bacteroides

#CheckM_taxonomy_wf_taxonomic_rank = Genus
#CheckM_taxonomy_wf_taxon = Prevotella

# OPTIONAL: params_CheckM_taxonomy_wf (quote delimited)
# Specify any algorithmic parameters that CheckM taxonomy workflow knows about.
#
# Example: params_CheckM_taxonomy_wf = "--aai_strain 0.85"
#
# NOTE 1: you should NOT specify any input and output files,
#         since Orchestra handles these; only parameters
#         that affect calculations are feasible here.
#
# NOTE 2: the value of this parameter is passed directly to CheckM
#         without any checking or formatting. It is the responsibility
#         of a user to assure the correctness of these parameters.
#
# NOTE 3: the value of each parameter that is not set in this
#         manner is set to its default value by CheckM itself.
#         Please consult CheckM documentation for details.
#
# -----------------------------------------------------------
# VIABLE PARAMETERS ARE THE FOLLOWING.
# Descriptions are copied verbatim
# (or slightly modified) from CheckM help.
# If you have questions regarding these parameters,
# please consult documentation or authors of CheckM.
# -----------------------------------------------------------
#
#   --individual_markers
#       treat marker as independent (i.e., ignore co-located set structure)
#
#   --skip_adj_correction
#       do not exclude adjacent marker genes when estimating contamination
#
#   --skip_pseudogene_correction
#       skip identification and filtering of pseudogenes
#
#   --aai_strain AAI_STRAIN
#       AAI threshold used to identify strain heterogeneity (default: 0.9)
#
#   --ignore_thresholds
#       ignore model-specific score thresholds
#
#   -e, --e_value E_VALUE
#       e-value cut off (default: 1e-10)
#
#   -l, --length LENGTH
#       percent overlap between target and query (default: 0.7)
#
# -----------------------------------------------------------
# END OF VIABLE PARAMETERS.
# -----------------------------------------------------------
#
# Leave params_CheckM_taxonomy_wf unset, if you do not have a specific
# reason to do otherwise, or if you are not familiar
# with CheckM parameters.

#params_CheckM_taxonomy_wf = "--aai_strain 0.85"

#************************************************************
# Parameters for execution of ezTree
# analysis of collected bins
#
# https://github.com/yuwwu/ezTree
#************************************************************

[ezTree]

# OPTIONAL: include_ezTree (default NO)
# Set to Yes to run ezTree.
#
# CheckM lineage workflow is going to be enabled
# regardless of its above settings, if user-specified
# bin filtering is applied by setting at
# least one of the following parameters (see below):
#   ezTree_CheckM_lineage_completeness_min
#   ezTree_CheckM_lineage_contamination_max
#   ezTree_CheckM_lineage_completeness_max
#   ezTree_CheckM_lineage_contamination_min

#include_ezTree = yes

# ALL FOUR OPTIONAL (numerical values between 0 and 100):
#   ezTree_CheckM_lineage_completeness_min
#   ezTree_CheckM_lineage_contamination_max
#
#   ezTree_CheckM_lineage_completeness_max
#   ezTree_CheckM_lineage_contamination_min
#
# Intervals of minimal and maximal completeness and contamination
# (as reported by CheckM lineage_wf) that collected bins
# must possess in order to be included in ezTree analysis.
#
# Often, only the minimal value of completeness and the maximal
# value of contamination are prescribed, but specifying the
# other two as well (or instead of them) enables easier studying
# of less prominent bins, if such a need arises.
#
# Example 1: ezTree_CheckM_lineage_completeness_min = 90
#            ezTree_CheckM_lineage_contamination_max = 5
#            ...the other two parameters unspecified
#
# Example 2: ezTree_CheckM_lineage_completeness_min = 70
#            ezTree_CheckM_lineage_completeness_max = 90
#            ezTree_CheckM_lineage_contamination_max = 7
#            ...the fourth parameter unspecified
#
# Example 3: ezTree_CheckM_lineage_completeness_min = 70
#            ezTree_CheckM_lineage_completeness_max = 92
#            ezTree_CheckM_lineage_contamination_min = 4
#            ezTree_CheckM_lineage_contamination_max = 9
#
# If any of these parameters is disabled, it does not
# impose its respective restriction on collected bins.
#
# To select all collected bins for analysis, disable all
# four limiting parameters. In this case CheckM lineage
# workflow does not have to be executed (although it
# can be, if it is needed for other purposes).
#
# NOTE 1: ezTree requires bins of a sufficient quality
#         to produce its results. Also, too many bins
#         may cause ezTree to fail, because it becomes
#         impossible to find any PFAM families
#         that exist once and only once in all genomes.
#
# NOTE 2: consequently, it is generally advised
#         to set parameter
#         ezTree_CheckM_lineage_completeness_min
#         to at least a value of 50 (percent) or
#         greater. Other typical choices are between
#         50.0001 and 70, between 70.0001 and 90, and
#         between 90.0001 and 100.
#         At the same time, it is generally advised
#         to set parameter
#         ezTree_CheckM_lineage_contamination_max
#         to at most a value of 5 or 10 (percent).
#         Failing to do so will typically result
#         in inclusion of bins with insufficient
#         quality in ezTree analysis.
#
# NOTE 3: the above recommendations are only
#         general and generic. There are situations,
#         where different settings also make sense.
#         If you are an expert, then you probably know
#         better how to set these parameters.

#ezTree_CheckM_lineage_completeness_min = 30
#ezTree_CheckM_lineage_contamination_max = 90
#ezTree_CheckM_lineage_completeness_max = 70
#ezTree_CheckM_lineage_contamination_min = 10

# OPTIONAL: param_ezTree_evalue (default 1e-10)
#
# Specify ezTree parameter evalue.
#
# Example: param_ezTree_evalue = 5e-10
#
# Leave this parameter unset, if you do not have a
# specific reason to do otherwise.

#param_ezTree_evalue = 5e-10

# OPTIONAL: param_ezTree_model (default JTT)
#
# Specify ezTree evolutionary model.
# Permissible values are JTT, WAG, or LG.
#
# Leave this parameter unset, if you do not have a
# specific reason to do otherwise.

#param_ezTree_model = WAG

#************************************************************
# Parameters for execution of Prokka
# annotation of collected bins
#
# https://github.com/tseemann/prokka
#************************************************************

[Prokka]

# OPTIONAL: include_Prokka (default NO)
# Set to Yes to run Prokka.
#
# CheckM lineage workflow is going to be enabled
# regardless of its above settings, if user-specified
# bin filtering is applied by setting at
# least one of the following parameters (see below):
#   Prokka_CheckM_lineage_completeness_min
#   Prokka_CheckM_lineage_contamination_max
#   Prokka_CheckM_lineage_completeness_max
#   Prokka_CheckM_lineage_contamination_min

#include_Prokka = yes

# OPTIONAL: Prokka_add_Pfam_database (default NO)
# Set to Yes to also take into account the Pfam HMM
# database in addition to databases that are
# distributed with Prokka.
#Prokka_add_Pfam_database = yes

# ALL FOUR OPTIONAL (numerical values between 0 and 100):
#   Prokka_CheckM_lineage_completeness_min
#   Prokka_CheckM_lineage_contamination_max
#
#   Prokka_CheckM_lineage_completeness_max
#   Prokka_CheckM_lineage_contamination_min
#
# Intervals of minimal and maximal completeness and contamination
# (as reported by CheckM lineage_wf) that collected bins
# must possess in order to be included in Prokka annotation.
#
# If any of these parameters is disabled, it does not
# impose its respective restriction on collected bins.
#
# To select all collected bins for analysis, disable all
# four limiting parameters. In this case CheckM lineage
# workflow does not have to be executed (although it
# can be, if it is needed for other purposes).
#
# Please see a more detailed description of analogous
# parameters in the ezTree section above.

#Prokka_CheckM_lineage_completeness_min = 30
#Prokka_CheckM_lineage_contamination_max = 90
#Prokka_CheckM_lineage_completeness_max = 70
#Prokka_CheckM_lineage_contamination_min = 10

# OPTIONAL: params_Prokka (quote delimited)
# Specify any algorithmic parameters that Prokka knows about.
#
# Example: params_Prokka = "--metagenome"
#
# NOTE 1: you should NOT specify any input and output files,
#         since Orchestra handles these; only parameters
#         that affect calculations are feasible here.
#
# NOTE 2: the value of this parameter is passed directly to Prokka
#         without any checking or formatting. It is the responsibility
#         of a user to assure the correctness of these parameters.
#
# NOTE 3: the value of each parameter that is not set in this
#         manner is set to its default value by Prokka itself.
#         Please consult Prokka documentation for details.
#
# -----------------------------------------------------------
# VIABLE PARAMETERS ARE THE FOLLOWING.
# Descriptions are copied verbatim
# (or slightly modified) from Prokka help.
# If you have questions regarding these parameters,
# please consult documentation or authors of Prokka.
# -----------------------------------------------------------
#
#   --addgenes        Add 'gene' features for each 'CDS' feature
#                     (default OFF)
#
#   --addmrna         Add 'mRNA' features for each 'CDS' feature
#                     (default OFF)
#
#   --locustag [X]    Locus tag prefix [auto] (default '')
#
#   --increment [N]   Locus tag counter increment (default '1')
#
#   --gffver [N]      GFF version (default '3')
#
#   --compliant       Force Genbank/ENA/DDJB compliance: --addgenes
#                     --mincontiglen 200 --centre XXX (default OFF)
#
#   --centre [X]      Sequencing centre ID.
#                     (default '')
#
#   --accver [N]      Version to put in Genbank file (default '1')
#
#
#
# Organism details:
#
#   --genus [X]       Genus name (default 'Genus')
#
#   --species [X]     Species name (default 'species')
#
#   --strain [X]      Strain name (default 'strain')
#
#   --plasmid [X]     Plasmid name or identifier (default '')
#
#
#
# Annotations:
#
#   --kingdom [X]     Annotation mode:
#                     Archaea|Bacteria|Mitochondria|Viruses
#                     (default 'Bacteria')
#
#   --gcode [N]       Genetic code / Translation table
#                     (set if --kingdom is set) (default '0')
#
#   --gram [X]        Gram: -/neg +/pos (default '')
#
#   --usegenus        Use genus-specific BLAST databases
#                     (needs --genus) (default OFF)
#
#   --proteins [X]    FASTA or GBK file to use as 1st priority (default '')
#
#   --hmms [X]        Trusted HMM to first annotate from (default '')
#
#   --metagenome      Improve gene predictions for highly fragmented genomes
#                     (default OFF)
#
#   --rawproduct      Do not clean up /product annotation (default OFF)
#
#   --cdsrnaolap      Allow [tr]RNA to overlap CDS (default OFF)
#
#
#
# Matching:
#
#   --evalue [n.n]    Similarity e-value cut-off (default '1e-09')
#
#   --coverage [n.n]  Minimum coverage on query protein (default '80')
#
#
#
# Computation:
#
#   --fast            Fast mode - only use basic BLASTP databases
#                     (default OFF)
#
#   --noanno          For CDS just set /product="unannotated protein"
#                     (default OFF)
#
#   --mincontiglen [N] Minimum contig size [NCBI needs 200] (default '1')
#
#   --rfam            Enable searching for ncRNAs with Infernal+Rfam
#                     (SLOW!) (default '0')
#
#   --norrna          Don't run rRNA search (default OFF)
#
#   --notrna          Don't run tRNA search (default OFF)
#
#   --rnammer         Prefer RNAmmer over Barrnap for rRNA prediction
#                     (default OFF)
#
# -----------------------------------------------------------
# END OF VIABLE PARAMETERS.
# -----------------------------------------------------------
#
# Leave params_Prokka unset, if you do not have a specific
# reason to do otherwise, or if you are not familiar
# with Prokka parameters.
#
# However, according to Orchestra's typical/expected use cases,
# flag "--metagenome" should probably always be set.

#params_Prokka = "--metagenome"

#************************************************************
# Parameters for execution of Roary
# building of pan genome from
# collected bins and Prokka annotations
#
# https://github.com/sanger-pathogens/Roary
#************************************************************

[Roary]

# OPTIONAL: include_Roary (default NO)
# Set to Yes to run Roary.
#
# If this step is enabled, then Prokka is
# going to be enabled by Orchestra regardless
# of the above Prokka settings.
#
# Please note that the subset of collected
# bins that enters Roary's pan genome build process
# is the same as the one input to Prokka by the
# user's specified criteria in the Prokka
# section of this configuration file.

#include_Roary = yes

# OPTIONAL: params_Roary (quote delimited)
# Specify any algorithmic parameters that Roary knows about.
#
# Example: params_Roary = "-ap"
#          (allow paralogs in core alignment)
#
# NOTE 1: you should NOT specify any input and output files,
#         since Orchestra handles these; only parameters
#         that affect calculations are feasible here.
#
# NOTE 2: the value of this parameter is passed directly to Roary
#         without any checking or formatting. It is the responsibility
#         of a user to assure the correctness of these parameters.
#
# NOTE 3: the value of each parameter that is not set in this
#         manner is set to its default value by Roary itself.
#         Please consult Roary documentation for details.
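#
# Purely as an illustrative sketch (not a recommendation): options
# from the list below can be combined, e.g. a fast MAFFT core-gene
# alignment together with a relaxed blastp identity threshold:
#
#   params_Roary = "-e -n -i 90"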
#************************************************************
# Parameters for execution of Roary
#     building of a pan genome from
#     collected bins and Prokka annotations
#
#     https://github.com/sanger-pathogens/Roary
#************************************************************

[Roary]

# OPTIONAL: include_Roary (default NO)
# Set to Yes to run Roary.
#
# If this step is enabled, then Prokka is
# going to be enabled by Orchestra regardless
# of the above Prokka settings.
#
# Please note that the subset of collected
# bins that enters Roary's pan genome build process
# is the same one that enters Prokka, as selected
# by the user-specified criteria in the Prokka
# section of this configuration file.

#include_Roary = yes



# OPTIONAL: params_Roary (quote delimited)
# Specify any algorithmic parameters that Roary knows about.
#
# Example: params_Roary = "-ap"
#          (allow paralogs in core alignment)
#
# NOTE 1: you should NOT specify any input or output files,
#         since Orchestra handles these; only parameters
#         that affect calculations are feasible here.
#
# NOTE 2: the value of this parameter is passed directly to Roary
#         without any checking or formatting. It is the responsibility
#         of the user to ensure the correctness of these parameters.
#
# NOTE 3: each parameter that is not set in this manner is set
#         to its default value by Roary itself.
#         Please consult the Roary documentation for details.
#
# -----------------------------------------------------------
# VIABLE PARAMETERS ARE THE FOLLOWING.
# Descriptions are copied verbatim
# (or slightly modified) from Roary help.
# If you have questions regarding these parameters,
# please consult the documentation or authors of Roary.
# -----------------------------------------------------------
#
# -e          create a multiFASTA alignment of core genes using PRANK
#
# -n          fast core gene alignment with MAFFT, use with -e
#
# -i          minimum percentage identity for blastp [95]
#
# -cd FLOAT   percentage of isolates a gene must be in to be core [99]
#
# -g INT      maximum number of clusters [50000]
#
# -s          don't split paralogs
#
# -t INT      translation table [11]
#
# -ap         allow paralogs in core alignment
#
# -y          add gene inference information to spreadsheet,
#             doesn't work with -e
#
# -iv STR     change the MCL inflation value [1.5]
#
# -----------------------------------------------------------
# END OF VIABLE PARAMETERS.
# -----------------------------------------------------------
#
# Leave params_Roary unset if you do not have a specific
# reason to do otherwise, or if you are not familiar
# with Roary parameters.

#params_Roary = "-ap"
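# EXAMPLE (illustrative values, not recommendations): to enable
# Roary and request a fast MAFFT-based core gene alignment with
# a relaxed blastp identity threshold, one might set:
#
# include_Roary = yes
# params_Roary = "-e -n -i 90"
#
# Here "-e" and "-n" are combined as the Roary help above
# suggests, and "-i 90" lowers the minimum blastp identity
# from its default of 95.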
#************************************************************
# Parameters for execution of FastANI
#     alignment-free computation of whole-genome
#     Average Nucleotide Identity between genomes
#
#     https://github.com/ParBLiSS/FastANI
#************************************************************

[FastANI]

# In order to successfully utilize the FastANI part of
# Orchestra processing, it is necessary to follow the
# FastANI instructions and provide Orchestra with an appropriate
# external database for genome comparison. The file or directory
# with the external database is specified through the configuration
# parameters below.
#
# The database can be either a collection of draft genomes
# produced within one's own study, an external narrow database
# for analyses of a single specific phylogenetic group
# (for example, databases D1 to D5 at
# http://enve-omics.ce.gatech.edu/data/fastani),
# or a general database containing a large number of high
# quality genomes (for example, the NCBI database at the
# same link http://enve-omics.ce.gatech.edu/data/fastani)
# for assessing species boundaries of a novel cohort of draft
# metagenome assembled genomes.
#
# Genome Taxonomy Database
# Users may also consider Orchestra as the first step
# (taking raw sequencing data to metagenome assembled genomes)
# towards the emerging Genome Taxonomy Database platform
# (https://gtdb.ecogenomic.org), which provides ongoing
# improvements in exact prokaryotic genome taxonomy.



# OPTIONAL: include_FastANI (default NO)
# Set to Yes to run FastANI.
#
# The CheckM lineage workflow is going to be enabled
# regardless of its above settings, if user-specified
# bin filtering is applied by setting at
# least one of the following parameters (see below):
# FastANI_CheckM_lineage_completeness_min
# FastANI_CheckM_lineage_contamination_max
# FastANI_CheckM_lineage_completeness_max
# FastANI_CheckM_lineage_contamination_min

#include_FastANI = yes



# ALL FOUR OPTIONAL (numerical values between 0 and 100):
# FastANI_CheckM_lineage_completeness_min
# FastANI_CheckM_lineage_contamination_max
#
# FastANI_CheckM_lineage_completeness_max
# FastANI_CheckM_lineage_contamination_min
#
# Intervals of minimal and maximal completeness and contamination
# (as reported by CheckM lineage_wf) that collected bins
# must possess in order to be included in FastANI analysis.
#
# If any of these parameters is disabled, it does not
# impose its respective restriction on collected bins.
#
# To select all collected bins for analysis, disable all
# four limiting parameters. In this case the CheckM lineage
# workflow does not have to be executed (although it
# can be, if it is needed for other purposes).
#
# Please see a more detailed description of analogous
# parameters in the ezTree section above.

#FastANI_CheckM_lineage_completeness_min = 30
#FastANI_CheckM_lineage_contamination_max = 90

#FastANI_CheckM_lineage_completeness_max = 70
#FastANI_CheckM_lineage_contamination_min = 10



# REQUIRED if include_FastANI=yes: FastANI_analysis_variant
# Selects one of the following FastANI processing variants.
#
# 1ref-1query: the user specifies one reference genome; Orchestra
#              runs a separate FastANI analysis of each eligible
#              collected bin against the specified reference genome
#              (behind the scenes Orchestra supplies to FastANI
#              the parameters "--ref" and "--query").
#              This is the only mode of operation with the
#              possibility of generating output mappings for
#              visualization (FastANI parameter "--visualize",
#              which is automatically provided by Orchestra).
#
# 1ref-Mquery: the user specifies one reference genome; Orchestra
#              runs one FastANI analysis for the entire set
#              of eligible collected bins (behind the scenes
#              Orchestra supplies to FastANI the parameters
#              "--ref" and "--queryList").
#
# Mref-1query: the user specifies several reference genomes; Orchestra
#              runs a separate FastANI analysis of each eligible
#              collected bin against the specified set of reference
#              genomes (behind the scenes Orchestra supplies to
#              FastANI the parameters "--refList" and "--query").
#
# Mref-Mquery: the user specifies several reference genomes; Orchestra
#              runs one FastANI analysis for the entire set of
#              eligible collected bins (behind the scenes Orchestra
#              supplies to FastANI the parameters "--refList" and
#              "--queryList").
#
# One and only one of the following selections must be enabled,
# if FastANI processing is enabled.

# FastANI_analysis_variant = 1ref-1query
# FastANI_analysis_variant = 1ref-Mquery
# FastANI_analysis_variant = Mref-1query
# FastANI_analysis_variant = Mref-Mquery



# REQUIRED if FastANI_analysis_variant is 1ref-1query or 1ref-Mquery:
# FastANI_reference_genome
#
# File name of a single reference genome (fasta/fastq/fna)[.gz]
# against which the collected bins' Average Nucleotide
# Identity is computed. Some reference genomes may be downloaded from
# http://enve-omics.ce.gatech.edu/data/fastani

#FastANI_reference_genome = '/home/johnDoe/data/refGenome.fna'



# REQUIRED if FastANI_analysis_variant is Mref-1query or Mref-Mquery:
# FastANI_reference_genome_pattern
#
# Linux file pattern that targets several reference genome files.
# For example: '/home/johnDoe/data/refGenome*.fna'
#
# NOTE: it is NOT possible to enumerate several reference genome
#       files by specifying this parameter more than once. All
#       required reference genome files must be targeted by
#       the specified Linux file pattern.
#       If in doubt, please open a Linux terminal in the directory
#       with the reference genome files, and execute something like
#       "ls refGenome*.fna".
#
# TIP: If your directory with reference genome files contains more files
#      than you intend to process within an isolated Orchestra run,
#      and if it is impossible to target only the appropriate subset
#      of them with a file pattern, then it is possible to proceed
#      as follows.
#
#      SOLUTION 1: create a new directory (say /home/me/special_refs).
#                  Then copy or move the appropriate subset of
#                  reference genome files to the new directory.
#                  This way it is easy to target only these files
#                  with generic file patterns like
#                  /home/me/special_refs/refGenome*.fna
#
#      SOLUTION 2: create a new directory (say /home/me/special_refs).
#                  Within this directory create symbolic links to the
#                  appropriate files. Symbolic links are created with
#                  the Linux command "ln -s /path/to/file /path/to/symlink".
#                  Then target all of the linked files with generic
#                  patterns like /home/me/special_refs/refGenome*.fna
#                  (see the shell example below).
#
#      Solution 1 is probably easier to do, whereas solution 2 has the
#      advantage that no files are actually moved around the file
#      system. This way, the same files may be linked into different
#      directories and simultaneously take part in different analysis
#      combinations without occupying disk space more than once.
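# EXAMPLE (hypothetical paths): a shell session implementing
# SOLUTION 2 could look like the following; adjust directory
# and file names to your own setup.
#
#   mkdir /home/me/special_refs
#   ln -s /home/me/all_refs/refGenome01.fna /home/me/special_refs/refGenome01.fna
#   ln -s /home/me/all_refs/refGenome07.fna /home/me/special_refs/refGenome07.fna
#   ls /home/me/special_refs/refGenome*.fna
#
# The final "ls" verifies that the pattern
# /home/me/special_refs/refGenome*.fna targets exactly the
# linked files; the same pattern can then be used as
# FastANI_reference_genome_pattern below.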
#FastANI_reference_genome_pattern = '/home/johnDoe/data/refGenome*.fna'



# OPTIONAL: params_FastANI (quote delimited)
# Specify any algorithmic parameters that FastANI knows about.
#
# Example: params_FastANI = "--fragLen 2000"
#
# NOTE 1: you should NOT specify any input or output files,
#         since Orchestra handles these; only parameters
#         that affect calculations are feasible here.
#
# NOTE 2: the value of this parameter is passed directly to FastANI
#         without any checking or formatting. It is the responsibility
#         of the user to ensure the correctness of these parameters.
#
# NOTE 3: each parameter that is not set in this manner is set
#         to its default value by FastANI itself.
#         Please consult the FastANI documentation for details.
#
# -----------------------------------------------------------
# VIABLE PARAMETERS ARE THE FOLLOWING.
# Descriptions are copied verbatim
# (or slightly modified) from FastANI help.
# If you have questions regarding these parameters,
# please consult the documentation or authors of FastANI.
# -----------------------------------------------------------
#
# -k, --kmer   kmer size <= 16 [default : 16]
#
# --fragLen    fragment length [default : 3,000]
#
# --minFrag    minimum matched fragments for
#              trusting ANI [default : 50]
#
# -----------------------------------------------------------
# END OF VIABLE PARAMETERS.
# -----------------------------------------------------------
#
# Leave params_FastANI unset if you do not have a specific
# reason to do otherwise, or if you are not familiar
# with FastANI parameters.

#params_FastANI = "--fragLen 500"
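# EXAMPLE (illustrative, all values are placeholders): a minimal
# FastANI setup that compares the entire set of eligible collected
# bins against several reference genomes in one run might read:
#
# include_FastANI = yes
# FastANI_analysis_variant = Mref-Mquery
# FastANI_reference_genome_pattern = '/home/johnDoe/data/refGenome*.fna'
# params_FastANI = "--fragLen 2000"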