# Install BiG-SCAPE v1.1.5 from its GitHub release archive and show its help.
# ~/biosoft is the author's own software directory; change it to suit your setup.
cd ~/biosoft
# Save the archive under an explicit name: by default wget names the file after
# the last URL path component (v1.1.5.zip), which would not match the unzip step.
wget -O BiG-SCAPE-1.1.5.zip https://github.com/medema-group/BiG-SCAPE/archive/refs/tags/v1.1.5.zip
unzip BiG-SCAPE-1.1.5.zip
cd BiG-SCAPE-1.1.5
# Print the command-line help to confirm the script can be executed.
~/biosoft/BiG-SCAPE-1.1.5/bigscape.py -h
usage: BiG-SCAPE [-h] [-l LABEL] [-i INPUTDIR] -o OUTPUTDIR [--pfam_dir PFAM_DIR] [-c CORES] [--include_gbk_str INCLUDE_GBK_STR [INCLUDE_GBK_STR ...]] [--exclude_gbk_str EXCLUDE_GBK_STR [EXCLUDE_GBK_STR ...]] [-v] [--include_singletons] [-d DOMAIN_OVERLAP_CUTOFF] [-m MIN_BGC_SIZE] [--mix] [--no_classify] [--banned_classes {PKSI,PKSother,NRPS,RiPPs,Saccharides,Terpene,PKS-NRP_Hybrids,Others} [{PKSI,PKSother,NRPS,RiPPs,Saccharides,Terpene,PKS-NRP_Hybrids,Others} ...]] [--cutoffs CUTOFFS [CUTOFFS ...]] [--clans-off] [--clan_cutoff CLAN_CUTOFF CLAN_CUTOFF] [--hybrids-off] [--mode {global,glocal,auto}] [--anchorfile ANCHORFILE] [--force_hmmscan] [--skip_ma] [--mibig] [--mibig21] [--mibig14] [--mibig13] [--query_bgc QUERY_BGC] [--domain_includelist] [--version]

optional arguments:
-h, --help show this help message and exit 显示此帮助信息并退出
-l LABEL, --label LABEL
An extra label for this run (will be used as part of the folder name within the network_files results) 为此次运行添加一个额外的标签(将在network_files结果文件夹名称中使用)
-i INPUTDIR, --inputdir INPUTDIR
Input directory of gbk files, if left empty, all gbk files in current and lower directories will be used.
gbk文件的输入目录,如果留空,将使用当前和下级目录中的所有gbk文件
-o OUTPUTDIR, --outputdir OUTPUTDIR
Output directory, this will contain all output data files.
输出目录,将包含所有输出数据文件
--pfam_dir PFAM_DIR Location of hmmpress-processed Pfam files. Default is same location of BiG-SCAPE
hmmpress处理过的Pfam文件的位置。默认是BiG-SCAPE的同一位置
-c CORES, --cores CORES
Set the number of cores the script may use (default: use all available cores) 设置脚本可以使用的核心数(默认:使用所有可用的核心)
--include_gbk_str INCLUDE_GBK_STR [INCLUDE_GBK_STR ...] Only gbk files with this string(s) will be used for the analysis (default: 'cluster', 'region'). Use an asterisk
to accept every file (overrides '--exclude_gbk_str') 仅使用包含此字符串的gbk文件进行分析(默认:'cluster', 'region')。使用星号(*)接受所有文件(覆盖'--exclude_gbk_str')
--exclude_gbk_str EXCLUDE_GBK_STR [EXCLUDE_GBK_STR ...] If any string in this list occurs in the gbk filename, this file will not be used for the analysis (default:
final).
如果gbk文件名中出现此列表中的任何字符串,则该文件不会用于分析(默认:final)
-v, --verbose Prints more detailed information. Toggle to activate.
输出更详细的信息。切换以激活
--include_singletons Include nodes that have no edges to other nodes from the network. Toggle to activate.
包含网络中没有与其他节点连接的节点。切换以激活
-d DOMAIN_OVERLAP_CUTOFF, --domain_overlap_cutoff DOMAIN_OVERLAP_CUTOFF
Specify at which overlap percentage domains are considered to overlap. Domain with the best score is kept (default=0.1).
指定在何种重叠百分比下域被认为是重叠的。保留得分最高的域(默认=0.1)
-m MIN_BGC_SIZE, --min_bgc_size MIN_BGC_SIZE
Provide the minimum size of a BGC to be included in the analysis. Default is 0 base pairs
提供要包含在分析中的BGC的最小大小。默认是0个碱基对
--mix By default, BiG-SCAPE separates the analysis according to the BGC product (PKS Type I, NRPS, RiPPs, etc.) and will
create network directories for each class. Toggle to include an analysis mixing all classes
默认情况下,BiG-SCAPE根据BGC产物(PKS Type I, NRPS, RiPPs等)分开分析,并为每个类别创建网络目录。切换以包含混合所有类别的分析
--no_classify By default, BiG-SCAPE classifies the output files analysis based on the BGC product. Toggle to deactivate (note
that if the --mix parameter is not activated, BiG-SCAPE will not create any network file).
默认情况下,BiG-SCAPE根据BGC产物分类输出文件分析。切换以停用(注意,如果未激活--mix参数,BiG-SCAPE将不会创建任何网络文件)
--banned_classes {PKSI,PKSother,NRPS,RiPPs,Saccharides,Terpene,PKS-NRP_Hybrids,Others}[{PKSI,PKSother,NRPS,RiPPs,Saccharides,Terpene,PKS-NRP_Hybrids,Others} ...] Classes that should NOT be included in the classification. E.g. "--banned_classes PKSI PKSOther" 不应包括在分类中的类别。例如:“--banned_classes PKSI PKSOther”
--cutoffs CUTOFFS [CUTOFFS ...] Generate networks using multiple raw distance cutoff values. Values should be in the range [0.0, 1.0]. Example:
--cutoffs 0.1 0.25 0.5 1.0. Default: c=0.3.
使用多个原始距离截断值生成网络。值应在[0.0, 1.0]范围内。例如:--cutoffs 0.1 0.25 0.5 1.0。默认值:c=0.3
--clans-off Toggle to deactivate a second layer of clustering to attempt to group families into clans
切换以停用第二层聚类以尝试将家族分组为氏族
--clan_cutoff CLAN_CUTOFF CLAN_CUTOFF
Cutoff Parameters for which clustering families into clans will be performed in raw distance. First value is the
cutoff value family assignments for BGCs used in clan clustering (default: 0.3). Second value is the cutoff value
for clustering families into clans (default: 0.7). Average linkage for BGCs in a family is used for distances
between families. Valid values are in the range [0.0, 1.0]. Example: --clan_cutoff 0.3 0.7) 用于将家族聚类为氏族的原始距离截断参数。
第一个值是用于氏族聚类的BGC家族分配的截断值(默认:0.3)。
第二个值是将家族聚类为氏族的截断值(默认:0.7)。
家族中BGC的平均链接用于家族之间的距离。有效值在[0.0, 1.0]范围内。例如:--clan_cutoff 0.3 0.7
--hybrids-off Toggle to also add BGCs with hybrid predicted products from the PKS/NRPS Hybrids and Others classes to each
subclass (e.g. a 'terpene-nrps' BGC from Others would be added to the Terpene and NRPS classes) 切换以将具有PKS/NRPS混合和其他类的混合预测产物的BGC添加到每个子类(例如,将Others中的'萜类-nrps' BGC添加到萜类和NRPS类)
--mode {global,glocal,auto} Alignment mode for each pair of gene clusters. 'global': the whole list of domains of each BGC are compared;'glocal': Longest Common Subcluster mode. Redefine the subset of the domains used to calculate distance by trying
to find the longest slice of common domain content per gene in both BGCs, then expand each slice. 'auto': use
glocal when at least one of the BGCs in each pair has the 'contig_edge' annotation from antiSMASH v4+, otherwise
use global mode on that pair
每对基因簇的比对模式。
'global':比较每个BGC的整个域列表;'glocal':最长公共子簇模式。
通过尝试查找每个BGC中每个基因的最长公共域内容片段,然后扩展每个片段来重新定义用于计算距离的域子集。
'auto':当每对中至少有一个BGC具有antiSMASH v4+的'contig_edge'注释时,使用glocal模式,否则对该对使用global模式
--anchorfile ANCHORFILE
Provide a custom location for the anchor domains file, default is anchor_domains.txt.
提供锚域文件的自定义位置,默认是anchor_domains.txt
--force_hmmscan Force domain prediction using hmmscan even if BiG-SCAPE finds processed domtable files (e.g. to use a new version
of PFAM).
即使BiG-SCAPE找到处理过的domtable文件,也强制使用hmmscan进行域预测(例如使用新版本的PFAM)
--skip_ma Skip multiple alignment of domains' sequences. Use if alignments have been generated in a previous run.
跳过域序列的多重比对。如果比对已在之前的运行中生成,请使用此选项
--mibig Include MIBiG 3.1 BGCs as reference (https://mibig.secondarymetabolites.org/). These BGCs will only be kept if they are connected to a region in the input set (distance < max(cutoffs)).
包含MIBiG 3.1 BGC作为参考(https://mibig.secondarymetabolites.org/ )。
只有当这些BGC与输入集中的某个区域相连(距离<最大截断值)时才会保留
--mibig21 Include BGCs from version 2.1 of MIBiG
包含MIBiG 2.1版本中的BGC
--mibig14 Include BGCs from version 1.4 of MIBiG
包含MIBiG 1.4版本中的BGC
--mibig13 Include BGCs from version 1.3 of MIBiG
包含MIBiG 1.3版本中的BGC
--query_bgc QUERY_BGC
Instead of making an all-VS-all comparison of all the input BGCs, choose one BGC to compare with the rest of the
set (one-VS-all). The query BGC does not have to be within inputdir
不进行所有输入BGC的全部对比,而是选择一个BGC与其余的集合进行对比(单一对比全部)。查询BGC不必在inputdir中
--domain_includelist Only analyze BGCs that include domains with the pfam accessions found in the domain_includelist.txt file
仅分析含有domain_includelist.txt文件中所列pfam登录号(accession)对应结构域的BGC
--version show program's version number and exit 显示程序的版本号并退出
Example
# Set up a working directory and fetch the example gbk data set from Zenodo.
mkdir BiG-SCAPE_example
cd BiG-SCAPE_example
# Quote the URL so the shell does not treat '?' as a glob character, and use -O
# so the download is saved as gbks.tar.gz rather than "gbks.tar.gz?download=1".
wget -O gbks.tar.gz 'https://zenodo.org/record/1340617/files/gbks.tar.gz?download=1'
tar -zxvf gbks.tar.gz
# Inspect the extracted directory layout.
tree gbks/
# Directory that will receive the collected region gbk files.
mkdir gbk_files
# MAG_BGC/ contains one BGC folder per MAG; each folder holds several gbk files.
ls MAG_BGC/ > MAG_list
while IFS= read -r i; do
  echo "$i"
  # Pick out the *region*.gbk files. A MAG without any region file yields an
  # empty list and is effectively skipped; if your files are named *cluster*
  # instead, adjust the pattern accordingly.
  find "MAG_BGC/$i" -name "*region*.gbk" > tmp_list
  while IFS= read -r j; do
    # Prefix each file with its genome (MAG) name and copy it into the single
    # gbk_files directory, so names from different MAGs cannot collide.
    cp -- "$j" "gbk_files/${i}_$(basename "$j")"
  done < tmp_list
done < MAG_list