-
Notifications
You must be signed in to change notification settings - Fork 5
/
phyml_DNAmodelFinder.sh
executable file
·1288 lines (1098 loc) · 50.3 KB
/
phyml_DNAmodelFinder.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env bash
#: phyml_DNAmodelFinder.sh
#: Author: Pablo Vinuesa, CCG-UNAM, @pvinmex, https://www.ccg.unam.mx/~vinuesa/
#: AIM: wraper script around phyml, to select a reasonable substitution model for DNA alignments.
#: Computes AIC, BIC, delta_BIC and BICw, and estimate a ML phylogeny using the best-fitting model
#: selected under the BIC.
#: LICENSE: GPL v3.0. See https://github.com/vinuesa/get_phylomarkers/blob/master/LICENSE
#: Design: phyml_DNAmodelFinder.sh evaluates the named parametric substitution models
# currently implemented in phml v3.*, combining them or not with +G and/or +I
# - Nucleotide-based models : HKY85 (default) | JC69 | K80 | F81 | F84 | TN93 | GTR
# Under runmode 2, the set is significantly expanded, by adding equal|unequal frquency
# model sets, which are automatically selected based on delta_BIC between JC69 and F81
# The models are fitted using a fixed NJ-JC tree, optimizing branch lenghts and rates, in order
# to calulate each model's AIC, BIC, delta_BIC and BICw.
# The best model is selected by BIC
#----------------------------------------------------------------------------------------
#: LICENSE: GPL v3.0. See https://github.com/vinuesa/get_phylomarkers/blob/master/LICENSE
#----------------------------------------------------------------------------------------
#: DISCLAIMER
#: THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
#: APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
#: HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
#: OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
#: THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
#: PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
#: IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
#: ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
#----------------------------------------------------------------------------------------
#: NOTES:
#: 1. For a versatile and highly customizable pan-genome analysis
#: software package, consider using GET_HOMOLOGUES
#: https://doi.org/10.1128%2FAEM.02411-13
#: https://github.com/eead-csic-compbio/get_homologues
#: https://hub.docker.com/u/eeadcsiccompbio
#:
#: 2. To perform core- and/or pan-genome phylogenomic analyses
#: consider using the GET_PHYLOMARKERS package
#: https://doi.org/10.3389/fmicb.2018.00771
#: https://github.com/vinuesa/get_phylomarkers
#: https://hub.docker.com/r/vinuesa/get_phylomarkers
#----------------------------------------------------------------------------------------
#: GitHub repo: you can fetch the latest version of the script from:
# https://github.com/vinuesa/TIB-filoinfo/blob/master/phyml_DNAmodelFinder.sh
# wget -c https://raw.githubusercontent.com/vinuesa/TIB-filoinfo/master/phyml_DNAmodelFinder.sh
#----------------------------------------------------------------------------------------
# set Bash's unofficial strict mode
set -euo pipefail
host=$(hostname)
progname=${0##*/}
version='1.2.1_2024-11-20' # v1.2.1_2024-11-20; added citation
# phyml_DNAmodelFinder.sh v1.2.0_2023-11-18;
# - implements extended (base) model set:
# * 6 standard named models
# * 64 extended equal-frequency (ef) models (TIM and TVM sets)
# * 62 extended unequal-frequency (uf) models (TIM and TVM sets)
# - minor code cleanup (removed unused variable)
min_bash_vers=4.4 # required to write modern bash idioms:
# 1. printf '%(%F)T' '-1' in print_start_time; and
# 2. passing an array or hash by name reference to a bash function (since version 4.3+),
# by setting the -n attribute
# see https://stackoverflow.com/questions/16461656/how-to-pass-array-as-an-argument-to-a-function-in-bash
min_phyml_version=3.3 # this corresponds to 3.3.year; check_phyml_version also extracts the year, suggesting to update to v2022
phymlv=''
phymlyr=''
DEBUG=0
n_starts=1 # seed trees
delta_BIC_cutoff=2 # to set compositional_heterogeneity, transitional_heterogeneity and pInv flags
# declare array and hash variables
declare -a models # array holding the base models (empirical substitution matrices to be evaluated)
declare -A model_cmds # hash holding the commands for each model
declare -A model_scores # hash holding the model lnL scores and AICi values
declare -A model_options # hash mapping option => model_set
declare -A model_free_params # hash mapping base models with their free parameters
declare -a TIMef TVMef TIMuf TVMuf # array holding the model codes
# To be implemented
#named_model_codes[JC]=000000
#named_model_codes[F81]=000000
#named_model_codes[K80]=010010
#named_model_codes[HKY]=010010
#named_model_codes[TN]=010020
#named_model_codes[TNe]=010020
#named_model_codes[K81]=012210 # TPM1
#named_model_codes[K81u]=012210 # TPM1
#named_model_codes[TPM2]=010212
#named_model_codes[TPM2u]=010212
#named_model_codes[TPM3]=012012
#named_model_codes[TPM3u]=012012
#named_model_codes[TIM]=012230
#named_model_codes[TIMe]=012230
#named_model_codes[TIM2]=010232
#named_model_codes[TIM2e]=010232
#named_model_codes[TIM3]=012032
#named_model_codes[TIM3e]=012032
#named_model_codes[TVM]=012314
#named_model_codes[TVMe]=012314
#named_model_codes[SYM]=012345
#named_model_codes[GTR]=012345
# standard_models # 6
model_free_params['JC69']=0
model_free_params['K80']=1
model_free_params['F81']=3
model_free_params['HKY85']=4
model_free_params['TN93']=5
model_free_params['GTR']=8
### extended_models_ef - TVM* # 15
model_free_params['012210ef']=2 # K81
model_free_params['012314ef']=4 # TVMef
model_free_params['012310ef']=3 # TVM1ef
model_free_params['010213ef']=3 # TVM2ef
model_free_params['012213ef']=3 # TVM3ef
model_free_params['012013ef']=3 # TVM4ef
model_free_params['010012ef']=2 # TVM5ef
model_free_params['012012ef']=2 # TVM6ef
model_free_params['010212ef']=2 # TVM7ef
model_free_params['010210ef']=2 # TVM8ef
model_free_params['012313ef']=3 # TVM9ef
model_free_params['010011ef']=1 # TVM10ef
model_free_params['012212ef']=2 # TVM11ef
model_free_params['011010ef']=1 # TVM12ef
model_free_params['001101ef']=1 # TVM13ef
#==
model_free_params['000100ef']=1 # TVM14ef
model_free_params['000101ef']=1 # TVM15ef
model_free_params['010110ef']=1 # TVM16ef
model_free_params['000102ef']=2 # TVM17ef
model_free_params['010112ef']=2 # TVM18ef
model_free_params['010211ef']=2 # TVM19ef
TVMef[0]=012210
TVMef[1]=012314
TVMef[2]=012310
TVMef[3]=010213
TVMef[4]=012213
TVMef[5]=012013
TVMef[6]=010012
TVMef[7]=012012
TVMef[8]=010212
TVMef[9]=010210
TVMef[10]=012313
TVMef[11]=010011
TVMef[12]=012212
TVMef[13]=011010
TVMef[14]=001101
#==
TVMef[15]=000100
TVMef[16]=000101
TVMef[17]=010110
TVMef[18]=000102
TVMef[19]=010112
TVMef[20]=010211
### extended_models_ef - TNef|TIM*|SYM # 26
model_free_params['010020ef']=2 # TNef
model_free_params['012230ef']=3 # TIMef
model_free_params['012345ef']=5 # SYMef
model_free_params['010023ef']=3 # TIM1ef
model_free_params['010232ef']=4 # TIM2ef
model_free_params['012232ef']=3 # TIM3ef
model_free_params['012332ef']=3 # TIM4ef
model_free_params['012342ef']=4 # TIM5ef
model_free_params['012343ef']=4 # TIM6ef
model_free_params['012340ef']=4 # TIM7ef
model_free_params['010021ef']=2 # TIM8ef
model_free_params['010022ef']=2 # TIM9ef
model_free_params['011123ef']=3 # TIM10ef
model_free_params['012222ef']=2 # TIM11ef
model_free_params['012223ef']=3 # TIM12ef
model_free_params['010120ef']=2 # TIM13ef
model_free_params['000120ef']=2 # TIM14ef
model_free_params['000121ef']=2 # TIM15ef
model_free_params['001021ef']=2 # TIM16ef
model_free_params['012234ef']=4 # TIM17ef
model_free_params['010231ef']=3 # TIM18ef
model_free_params['011230ef']=3 # TIM19ef
model_free_params['011020ef']=2 # TIM20ef
model_free_params['012130ef']=3 # TIM21ef
model_free_params['010121ef']=2 # TIM22ef
model_free_params['010122ef']=2 # TIM23ef
model_free_params['010123ef']=3 # TIM24ef
#==
model_free_params['001020ef']=2 # TIM25ef
model_free_params['000123ef']=3 # TIM26ef
model_free_params['010203ef']=3 # TIM27ef
model_free_params['010223ef']=3 # TIM28ef
model_free_params['010230ef']=3 # TIM29ef
model_free_params['012032ef']=3 # TIM30ef # < TIM3
model_free_params['010233ef']=3 # TIM31ef
model_free_params['001234ef']=4 # TIM32ef
model_free_params['010234ef']=4 # TIM33ef
model_free_params['011234ef']=4 # TIM34ef
model_free_params['012234ef']=4 # TIM35ef
model_free_params['012134ef']=4 # TIM36ef
model_free_params['012304ef']=4 # TIM37ef
model_free_params['012324ef']=4 # TIM38ef
model_free_params['012334ef']=4 # TIM39ef
model_free_params['012341ef']=4 # TIM40ef
model_free_params['012344ef']=4 # TIM41ef
TIMef[0]=010020 #TNef
TIMef[1]=012230
TIMef[2]=012345
TIMef[3]=010023
TIMef[4]=010232
TIMef[5]=012232
TIMef[6]=012332
TIMef[7]=012342
TIMef[8]=012343
TIMef[9]=012340
TIMef[10]=010021
TIMef[11]=010022
TIMef[12]=011123
TIMef[13]=012223
TIMef[14]=010120
TIMef[15]=000120
TIMef[16]=000121
TIMef[17]=001021
TIMef[18]=012234
TIMef[19]=010231
TIMef[20]=011230
TIMef[21]=011020
TIMef[22]=012130
TIMef[23]=010121
TIMef[24]=010122
TIMef[25]=010123
#==
TIMef[26]=001020
TIMef[27]=000123
TIMef[28]=010203
TIMef[29]=010223
TIMef[30]=010230
TIMef[31]=012032
TIMef[32]=010233
TIMef[33]=001234
TIMef[34]=010234
TIMef[35]=011234
TIMef[36]=012134
TIMef[37]=012222
TIMef[38]=012304
TIMef[39]=012324
TIMef[40]=012334
TIMef[41]=012341
TIMef[42]=012344
## extended_models_uf TVM* # 16
model_free_params['012210']=5 # K81uf # K81=TPM1
model_free_params['012314']=7 # TVM
model_free_params['012310']=6 # TVM1
model_free_params['010213']=6 # TVM2
model_free_params['012213']=6 # TVM3
model_free_params['012013']=6 # TVM4
model_free_params['010012']=5 # TVM5
model_free_params['012012']=5 # TVM6 # TPM3
model_free_params['010212']=5 # TVM7 # TPM2
model_free_params['012313']=6 # TVM8
model_free_params['010011']=4 # TVM9
model_free_params['012212']=5 # TVM10
model_free_params['010210']=5 # TVM11
model_free_params['011010']=4 # TVM12
model_free_params['001101']=4 # TVM13
#==
model_free_params['000100']=4 # TVM14
model_free_params['000101']=4 # TVM15
model_free_params['010110']=4 # TVM16
model_free_params['000102']=5 # TVM17
model_free_params['010112']=5 # TVM18
model_free_params['010211']=5 # TVM19
TVMuf[0]=012210
TVMuf[1]=012314
TVMuf[2]=012310
TVMuf[3]=010213
TVMuf[4]=012213
TVMuf[5]=012013
TVMuf[6]=010012
TVMuf[7]=012012 # TPM3
TVMuf[8]=010212
TVMuf[9]=012313
TVMuf[10]=010011
TVMuf[11]=012212
TVMuf[12]=010210
TVMuf[13]=011010
TVMuf[14]=001101
#==
TVMuf[15]=000100
TVMuf[16]=000101
TVMuf[17]=010110
TVMuf[18]=000102
TVMuf[19]=010112
TVMuf[20]=010211
## extended_models_uf TIM* # 25
model_free_params['012230']=6 # TIM # < TIM1
model_free_params['010023']=6 # TIM1
model_free_params['010232']=7 # TIM2 # < TIM2
model_free_params['012232']=6 # TIM3
model_free_params['012332']=6 # TIM4
model_free_params['012342']=7 # TIM5
model_free_params['012343']=7 # TIM6
model_free_params['012340']=7 # TIM7
model_free_params['010021']=5 # TIM8
model_free_params['010022']=5 # TIM9
model_free_params['010120']=5 # TIM10
model_free_params['011123']=6 # TIM11
model_free_params['012223']=6 # TIM12
model_free_params['012222']=5 # TIM13
model_free_params['000120']=5 # TIM14
model_free_params['000121']=5 # TIM15
model_free_params['001021']=5 # TIM16
model_free_params['012234']=7 # TIM17
model_free_params['010231']=6 # TIM18
model_free_params['011020']=5 # TIM19
model_free_params['011230']=6 #
model_free_params['012130']=6 # TIM20
model_free_params['010121']=5 # TIM21
model_free_params['010122']=5 # TIM22
model_free_params['010123']=6 # TIM23
#==
model_free_params['001020']=5 # TIM24
model_free_params['000123']=6 # TIM25
model_free_params['010203']=6 # TIM26
model_free_params['010223']=6 # TIM27
model_free_params['010230']=6 # TIM28
model_free_params['012032']=6 # TIM29 # < TIM3
model_free_params['010233']=6 # TIM30
model_free_params['001234']=7 # TIM31
model_free_params['010234']=7 # TIM32
model_free_params['011234']=7 # TIM33
model_free_params['012234']=7 # TIM34
model_free_params['012134']=7 # TIM35
model_free_params['012304']=7 # TIM36
model_free_params['012324']=7 # TIM37
model_free_params['012334']=7 # TIM38
model_free_params['012341']=7 # TIM39
model_free_params['012344']=7 # TIM40
TIMuf[0]=012230
TIMuf[1]=010023
TIMuf[2]=010232
TIMuf[3]=012232
TIMuf[4]=012332
TIMuf[5]=012342
TIMuf[6]=012343
TIMuf[7]=012340
TIMuf[8]=010021
TIMuf[9]=010022
TIMuf[10]=010120
TIMuf[11]=011123
TIMuf[12]=012223
TIMuf[13]=012222
TIMuf[14]=000120
TIMuf[15]=000121
TIMuf[16]=001021
TIMuf[17]=012234
TIMuf[18]=010231
TIMuf[19]=011020
TIMuf[20]=011230
TIMuf[21]=012130
TIMuf[22]=010121
TIMuf[23]=010122
TIMuf[24]=010123
#==
TIMuf[25]=001020
TIMuf[26]=000123
TIMuf[27]=010203
TIMuf[28]=010223
TIMuf[29]=010230
TIMuf[30]=012032
TIMuf[31]=010233
TIMuf[32]=001234
TIMuf[33]=010234
TIMuf[34]=011234
TIMuf[35]=012134
TIMuf[36]=012304
TIMuf[37]=012324
TIMuf[38]=012334
TIMuf[39]=012341
TIMuf[40]=012344
# array of models to evaluate
standard_models=(JC69 K80 F81 HKY85 TN93 GTR)
extended_models_ef=( "${TIMef[@]}" "${TVMef[@]}" ) # 64
extended_models_uf=( "${TIMuf[@]}" "${TVMuf[@]}" ) # 62
test_models=(JC69 K80 F81 HKY85 TN93)
# hash mapping option => model_set
model_options['1']='standard_models'
model_options['2']='extended_models'
model_options['3']='test_models'
#==============================#
# >>> FUNCTION DEFINITIONS <<< #
#------------------------------#
function check_dependencies()
{
declare -a progs required_binaries optional_binaries
local p programname
required_binaries=(awk bc sed perl phyml)
optional_binaries=(mpirun phyml-mpi)
for p in "${optional_binaries[@]}"
do
if type -P "$p" >/dev/null
then
progs=("${optional_binaries[@]}")
else
progs=()
fi
done
progs+=("${required_binaries[@]}")
for programname in "${progs[@]}"
do
if ! type -P "$programname"; then # print paths of binaries to STDOUT
echo
echo "$# ERROR: $programname not in place!"
echo " ... you will need to install \"$programname\" first, or include it in \$PATH"
echo " ... exiting"
exit 1
else
continue
fi
done
echo
echo '# Run check_dependencies() ... looks good: all required binaries are in place.'
echo
}
#-----------------------------------------------------------------------------------------
function check_bash_version()
{
local bash_vers min_bash_vers
min_bash_vers=$1
bash_vers=$(bash --version | awk 'NR==1{print $4}' | sed 's/(.*//' | cut -d. -f1,2)
# float comparisons using bc
if [[ 1 -eq "$(echo "$bash_vers < $min_bash_vers" | bc)" ]]
then
echo "# FATAL: you are using the old bash v.${bash_vers}
but $progname requires bash >= v${min_bash_vers}
to use hashes and other goodies"
exit 1
fi
echo "$bash_vers"
}
#-----------------------------------------------------------------------------------------
function check_phyml_version {
local phyml_version min_phyml_version phyml_version_year
min_phyml_version=$1
phyml_version_year=''
phyml_version=''
# This version extracts 3.3 from '3.3.20220408.'; but the series goes back to 2017 ...
# v3.3.20220408 is required to have the --leave_duplicates option
phyml_version=$(phyml --version | awk '/This is PhyML version/{print substr($NF, 0, 4)}' | sed 's/\.$//')
#phyml_OK=$(awk -v v="$phyml_version" -v m="$min_phyml_version" 'BEGIN{if(v < m || v > 2000) { print 0}else{print 1} }')
if [[ 1 -eq "$(echo "$phyml_version == $min_phyml_version" | bc)" ]]
then
phyml_version_year=$(phyml --version | awk '/This is PhyML version/{print substr($NF, 0, 8)}' | sed 's/.*\.//g')
else
phyml_version_year="$phyml_version"
fi
printf '%s\n' "${phyml_version}_${phyml_version_year}"
}
#-----------------------------------------------------------------------------------------
function check_is_phylip()
{
local phylip
phylip=$1
if ! awk 'NR==1 && NF==2' "$phylip" &> /dev/null; then
echo "FATAL ERROR: input file $phylip does not seem to by a canonical phylip alingment"
print_help
fi
}
#-----------------------------------------------------------------------------------------
function compute_nt_freq_in_phylip()
{
local phylip
phylip=$1
awk '
BEGIN{print "idx\tNT\tobs_freq"}
{
# ignore first row and column
if( NR > 1 && NF > 1){
# remove empty spaces
gsub(/[ ]+/," ")
l=length($0)
for(i=1; i<=l; i++){
c = substr($0, i, 1)
# count only standard amino acids
if (c ~ /[ACGT]/){
ccounts[c]++
letters++
}
}
}
}
# print relative frequency of each residue
END {
for (c in ccounts){
aa++
printf "%i\t%s\t%.4f\n", aa, c, (ccounts[c] / letters )
}
}' "$phylip"
}
#-----------------------------------------------------------------------------------------
function print_start_time()
{
#echo -n "[$(date +%T)] "
printf '%(%T )T' '-1' # requires Bash >= 4.3
}
#-----------------------------------------------------------------------------------------
function compute_AICi()
{
# AICi=$(compute_AICi "$score" "$total_params")
local score total_params
score=$1
total_params=$2
# AICi=-2*lnLi + 2*Ni
echo "(-2 * $score) + (2 * $total_params)" | bc -l
}
#-----------------------------------------------------------------------------------------
function compute_AICc()
{
# AICc=$(compute_AICc "$score" "$total_params" "$no_sites" "$AICi")
local score extra_params total_params n_sites AIC
score=$1
total_params=$2
n_sites=$3
AIC=$4
# AICi=-2*lnLi + 2*Ni
#AIC=$( echo "(-2 * $score) + (2 * $total_params)" | bc -l )
#echo $AIC + (2 * $total_params($total_params + 1)/($n_sites - $total_params -1)) | bc
echo "$AIC + ( 2 * ($total_params * ($total_params + 1))/($n_sites - $total_params -1) )" | bc -l
}
#-----------------------------------------------------------------------------------------
function compute_BIC()
{
# BICi=$(compute_BIC "$score" "$total_params" "$no_sites")
local score extra_params total_params n_sites
score=$1
total_params=$2
n_sites=$3
# BICi= k*ln(n) -2*lnLi
awk -v lnL="$score" -v k="$total_params" -v n="$n_sites" 'BEGIN{ BIC= (-2 * lnL) + (k * log(n)); printf "%0.5f", BIC }'
}
#-----------------------------------------------------------------------------------------
function check_compositional_heterogeneity()
{
# returns 1|0 if there is or not significant compositional_heterogeneity based on delta_AIC > 2
local JC_BICi F81_BICi uf_models_flag
JC_BICi=$1
F81_BICi=$2
#if [[ $(echo "$JC_BICi - $F81_BICi" | bc -l | cut -d. -f1) -gt "$delta_BIC_cutoff" ]]; then
if [[ "$(echo "if (${JC_BICi} - ${F81_BICi} >= ${delta_BIC_cutoff}) 1" | bc)" -eq 1 ]]; then
uf_models_flag=1
else
uf_models_flag=0
fi
echo "$uf_models_flag"
}
#-----------------------------------------------------------------------------------------
function check_transitional_heterogeneity()
{
# returns 1|0 if there is or not significant transitional_heterogeneity based on delta_AIC > 2
local HKY85_BICi TN93_BICi ti_models_flag
HKY85_BICi=$1
TN93_BICi=$2
ti_models_flag=''
#if [[ $(echo "if (${HKY85_BICi - $TN93_BICi" | bc | cut -d. -f1) -gt "$delta_BIC_cutoff" ]]; then
if [[ "$(echo "if (${HKY85_BICi} - ${TN93_BICi} >= ${delta_BIC_cutoff}) 1" | bc)" -eq 1 ]]; then
ti_models_flag=1
else
ti_models_flag=0
fi
echo "$ti_models_flag"
}
#-----------------------------------------------------------------------------------------
function check_pInv()
{
# returns 1|0 if there are or not a significant proportion of invariant sites
# based on delta_AIC > 2
local HKY85I_BICi HKY85IG_BICi pInv_flag
HKY85I_BICi=$1
HKY85IG_BICi=$2
#if [[ $(echo "$HKY85I_BICi - $HKY85IG_BICi" | bc -l | cut -d. -f1) -gt "$delta_BIC_cutoff" ]]; then
if [[ "$(echo "if (${HKY85I_BICi} - ${HKY85IG_BICi} >= ${delta_BIC_cutoff}) 1" | bc)" -eq 1 ]]; then
pInv_flag=1
else
pInv_flag=0
fi
echo "$pInv_flag"
}
#-----------------------------------------------------------------------------------------
function print_model_details()
{
cat <<EoH
1 -> standard models (JC69 K80 F81 HKY85 TN93 GTR)
2 -> 1 + extended_ef_models and extended_uf_models
the corresponding model subset being automatically selected
EoH
echo "# number of models in extended_models_ef = ${#extended_models_ef[@]}"
for m in "${!extended_models_ef[@]}"; do
echo "$m => ${extended_models_ef[$m]}"
done
echo '--------------------------'
echo "# number of models in extended_models_uf = ${#extended_models_uf[@]}"
for m in "${!extended_models_uf[@]}"; do
echo "$m => ${extended_models_uf[$m]}"
done
exit 0
}
#-----------------------------------------------------------------------------------------
function print_end_message()
{
cat <<EOF
========================================================================================
If you use $progname v.$version for your research,
I would appreciate that you:
1. Cite the code in your work as:
Pablo Vinuesa. $progname v.$version
https://github.com/vinuesa/TIB-filoinfo/blob/master/$progname
2. Give it a like on the https://github.com/vinuesa/TIB-filoinfo/ repo
Thanks!
EOF
}
#-----------------------------------------------------------------------------------------
function print_help(){
bash_vers=$(check_bash_version "$min_bash_vers")
bash_ge_5=$(awk -v bv="$bash_vers" 'BEGIN { if (bv >= 5.0){print 1}else{print 0} }')
if ((bash_ge_5 > 0)); then
cat <<EoH
$progname v${version} requires two arguments provided on the command line, the third one being optional:
$progname <string [aligned DNA sequences in PHYLIP format)> <int [model sets:1-3]> <int [no_rdm_starts; default:$n_starts]>
# model sets to choose from:
1 -> standard models (JC69 K80 F81 HKY85 TN93 GTR)
2 -> standard + ${#extended_models_ef[@]} extended_ef_models, OR
standard + ${#extended_models_uf[@]} extended_uf_models
NOTE: $progname automatically chooses the proper extended set (ef|uf) to evaluate,
based on delta_BIC evaluation of compositional bias (JC69 vs. F81)
3 -> minimal test set (JC69 F81 HKY85 TN93)
EXAMPLE: $progname primates.phy 2
AIM: $progname v${version} will evaluate the fit of the selected model set,
combined or not with +G and/or +f, computing AICi, BICi, deltaBIC and BICw,
inferring the ML tree under the BIC-selected model.
PROCEDURE:
- Models are fitted using a fixed BioNJ-JC tree, optimizing branch lengths and rates
to calculate their AICi, BICi, delta_BIC and BICw
- Only relevant models among the extended set are evaluated, based on delta_BIC
comparisons between JC69-F81, to decide if ef|uf models should be evaluated,
and comparisons between KHY85-TN93 to determine if models with two Ti rates should
be evaluated
- pInv is automatically excluded in the extended model set,
if the delta_BICi_HKY+G is =< $delta_BIC_cutoff when compared with the BIC_HKY+G+I
- The best model is selected by BIC
- SPR searches can be launched starting from multiple random trees
- Default single seed tree searches use a BioNJ tree with BEST moves
SOURCE: the latest version of the program is available on GitHub:
https://github.com/vinuesa/TIB-filoinfo
LICENSE: GPL v3.0. See https://github.com/vinuesa/get_phylomarkers/blob/master/LICENSE
CITATION: Vinuesa P. (2023). Efficient two-step selection of DNA models with phyml_DNAmodelFinder.
https://github.com/vinuesa/TIB-filoinfo/blob/master/phyml_DNAmodelFinder.sh
EoH
else
cat <<EoH
$progname v${version} requires two arguments provided on the command line, the third one being optional:
$progname <string [input phylip file (aligned DNA sequences)> <int [model sets:1|3]> <int [no_rdm_starts; default:$n_starts]>
# model sets to choose from:
1 -> (JC69 K80 F81 HKY85 TN93 GTR)
2 -> WILL NOT RUN properly on Bash < v5.0, sorry (see NOTE below)
3 -> minimal test set (JC K80 F81 HKY85 TN93)
AIM: $progname v${version} will evaluate the fit of the selected model set,
combined or not with +G and/or +f, computing AICi, BICi, deltaBIC and BICw,
inferring the ML tree under the BIC-selected model
PROCEDURE
- Models are fitted using a fixed NJ-JC tree, optimizing branch lenghts and rates
to calculate their AICi, BICi, delta_BIC and BICw
- The best model is selected by BIC
- SPR searches can be launched starting from multiple random trees
- Default single seed tree searches use a BioNJ with BEST moves
SOURCE: the latest version of the program is available on GitHub:
https://github.com/vinuesa/TIB-filoinfo
LICENSE: GPL v3.0. See https://github.com/vinuesa/get_phylomarkers/blob/master/LICENSE
NOTE: you are running the old Bash version $bash_vers.
Update to version >=5.0 to profit from the full set of models
and features implemented in $progname
EoH
fi
exit 0
}
#-----------------------------------------------------------------------------------------
#============================= END FUNCTION DEFINITIONS ==================================
#=========================================================================================
# ============ #
# >>> MAIN <<<
# ------------ #
## Check environment
# 0. Check that the input file was provided, and that the host runs bash >= v4.3
(( $# < 2 )) || (( $# > 3 )) && print_help
infile="$1"
model_set="$2"
n_starts="${3:-1}"
wkd=$(pwd)
phymlv=$(check_phyml_version "$min_phyml_version")
phymlyr=$(echo "$phymlv" | cut -d_ -f2)
check_is_phylip "$infile"
# OK, ready to start the analysis ...
start_time=$SECONDS
echo "========================================================================================="
bash_vers=$(check_bash_version "$min_bash_vers")
awk -v bv="$bash_vers" -v mb="$min_bash_vers" \
'BEGIN { if (bv < mb){print "FATAL: you are running acient bash v"bv, "and version >=", mb, "is required"; exit 1}else{print "# Bash version", bv, "OK"} }'
echo "# running with phyml v.${phymlv}"
((phymlyr < 2022)) && printf '%s\n%s\n' "# Warning: running old PhyML version from $phymlyr!" " Update to the latest one, using the phyml's GitHub repo: https://github.com/stephaneguindon/phyml/releases"
echo -n "# $progname v$version running on $host. Run started on: "; printf '%(%F at %T)T\n' '-1'
echo "# workding directory: $wkd"
check_dependencies
echo "# infile:$infile; model_set:$model_set => ${model_options[$model_set]}; seed trees: $n_starts; delta_BIC_cutoff=$delta_BIC_cutoff"
echo "========================================================================================="
echo ''
# 1. get sequence stats
print_start_time
echo " # 1. Computing sequence stats for ${infile}:"
no_seq=$(awk 'NR == 1{print $1}' "$infile")
echo "- number of sequences: $no_seq"
no_sites=$(awk 'NR == 1{print $2}' "$infile")
echo "- number of sites: $no_sites"
no_branches=$((2 * no_seq - 3))
echo "- number of branches: $no_branches"
echo "- observed nucleotide frequencies:"
compute_nt_freq_in_phylip "$infile"
echo '--------------------------------------------------------------------------------'
echo ''
# 2. set the selected model set, making a copy of the set array into the models array
case "$model_set" in
1) models=( "${standard_models[@]}" ) ;;
2) models=( "${test_models[@]}" ) ;; # plus automatic selection of extended_models to evaluate
3) models=( "${test_models[@]}" ) ;;
*) echo "unknown model set!" && print_help ;;
esac
# 3. Compute a fast NJ tree estimating distances with the JC model
print_start_time
echo "1. Computing NJ-JC tree for input file $infile with $no_seq sequences"
echo '--------------------------------------------------------------------------------'
phyml -i "$infile" -d nt -m JC69 -c 1 -b 0 -o n &> /dev/null
# 4. rename the outfile for future use as usertree
if [[ -s "${infile}"_phyml_tree.txt ]]; then
mv "${infile}"_phyml_tree.txt "${infile}"_JC-NJ.nwk
else
echo "FATAL ERROR: could not compute ${infile}_phyml_tree.txt (${infile}_JC-NJ.nw)" && exit 1
fi
# 5.1 run a for loop to combine all base models with (or not) +G, +I +G+I
# and fill the model_scores and model_cmds hashes
echo "2.1. running in a for loop to combine all base models in model_set ${model_set}=>${model_options[$model_set]},
with (or without) +G and or +I, and compute the model lnL, after optimizing branch lengths and rates"
# globals for compositional_heterogeneity check
declare -A seen
seen["HKY85+G"]=0
JC_BICi=0
F81_BICi=0
TN93_BICi=0
HKY85_BICi=0
HKY85I_BICi=0
compositional_heterogeneity=''
transitional_heterogeneity=''
use_pInv=''
freq_cmd=''
# loop to test standard models, with or without +G, +I, +G+I;
# Note the use of -f m to estimate base frequencies under ML. PhyML takes care of using or not -f m, given the standard model name
for mat in "${models[@]}"; do
print_start_time && echo "# running: phyml -i $infile -d nt -m $mat -f m -u ${infile}_JC-NJ.nwk -c 1 -v 0 -o lr"
((phymlyr >= 2022)) && phyml -i "$infile" -d nt -m "$mat" -f m -u "${infile}"_JC-NJ.nwk -c 1 -o lr --leave_duplicates --no_memory_check &> /dev/null
((phymlyr < 2022)) && phyml -i "$infile" -d nt -m "$mat" -f m -u "${infile}"_JC-NJ.nwk -c 1 -o lr --no_memory_check &> /dev/null
extra_params=0
total_params=$((no_branches + extra_params + ${model_free_params[$mat]}))
sites_by_K=$(echo 'scale=2;'"$no_sites/$total_params" | bc -l)
score=$(awk '/Log-l/{print $NF}' "${infile}"_phyml_stats.txt)
AICi=$(compute_AICi "$score" "$total_params")
AICc=$(compute_AICc "$score" "$total_params" "$no_sites" "$AICi")
BICi=$(compute_BIC "$score" "$total_params" "$no_sites")
printf -v model_string "%d\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f" "$total_params" "$sites_by_K" "$score" "$AICi" "$AICc" "$BICi"
model_scores["${mat}"]="$model_string"
model_cmds["${mat}"]="$mat"
# save the JC_BICi to test for significant compositional heterogeneity (JC vs F81)
# and transitional bias between purines and pyrimidines (HKY vs TN93)
if ((model_set == 2)); then
if [[ "$mat" == 'JC69' ]]; then
JC_BICi="$BICi"
echo "JC_BICi: $JC_BICi"
elif [[ "$mat" == 'F81' ]]; then
F81_BICi="$BICi"
echo "F81_BICi: $F81_BICi"
elif [[ "$mat" == 'HKY85' ]]; then
HKY85_BICi="$BICi"
echo "HKY85_BICi: $HKY85_BICi"
elif [[ "$mat" == 'TN93' ]]; then
TN93_BICi="$BICi"
echo "TN93_BICi: $TN93_BICi"
fi
fi
print_start_time && echo "# running: phyml -i $infile -d nt -m $mat -f m -c 4 -a e -u ${infile}_JC-NJ.nwk -o lr"
((phymlyr >= 2022)) && phyml -i "$infile" -d nt -m "${mat}" -f m -c 4 -a e -u "${infile}"_JC-NJ.nwk -o lr --leave_duplicates --no_memory_check &> /dev/null
((phymlyr < 2022)) && phyml -i "$infile" -d nt -m "${mat}" -f m -c 4 -a e -u "${infile}"_JC-NJ.nwk -o lr --no_memory_check &> /dev/null
extra_params=1
total_params=$((no_branches + extra_params + ${model_free_params[$mat]}))
sites_by_K=$(echo 'scale=2;'"$no_sites/$total_params" | bc -l)
score=$(awk '/Log-l/{print $NF}' "${infile}"_phyml_stats.txt)
AICi=$(compute_AICi "$score" "$total_params")
AICc=$(compute_AICc "$score" "$total_params" "$no_sites" "$AICi")
BICi=$(compute_BIC "$score" "$total_params" "$no_sites")
printf -v model_string "%d\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f" "$total_params" "$sites_by_K" "$score" "$AICi" "$AICc" "$BICi"
model_scores["${mat}+G"]="$model_string"
model_cmds["${mat}+G"]="$mat -c 4 -a e"
if ((model_set == 2)); then
if [[ $mat == 'HKY85' ]] && [[ -n ${model_cmds["${mat}+G"]} ]] && [[ ${seen["${mat}+G"]} -eq 0 ]]; then
HKY85G_BICi="$AICi"
echo "HKY85+G_BICi: $HKY85G_BICi"
seen['HKY85+G']=1
fi
fi
print_start_time && echo "# running: phyml -i $infile -d nt -m $mat -f m -v e -c 1 -u ${infile}_JC-NJ.nwk -o lr"
((phymlyr >= 2022)) && phyml -i "$infile" -d nt -m "$mat" -f m -v e -c 1 -u "${infile}"_JC-NJ.nwk -o lr --leave_duplicates --no_memory_check &> /dev/null
((phymlyr < 2022)) && phyml -i "$infile" -d nt -m "$mat" -f m -v e -c 1 -u "${infile}"_JC-NJ.nwk -o lr --no_memory_check &> /dev/null
extra_params=1 # 1 pInv
total_params=$((no_branches + extra_params + ${model_free_params[$mat]}))
sites_by_K=$(echo 'scale=2;'"$no_sites/$total_params" | bc -l)
score=$(awk '/Log-l/{print $NF}' "${infile}"_phyml_stats.txt)
AICi=$(compute_AICi "$score" "$total_params")
AICc=$(compute_AICc "$score" "$total_params" "$no_sites" "$AICi")
BICi=$(compute_BIC "$score" "$total_params" "$no_sites")
printf -v model_string "%d\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f" "$total_params" "$sites_by_K" "$score" "$AICi" "$AICc" "$BICi"
model_scores["${mat}+I"]="$model_string"
model_cmds["${mat}+I"]="$mat -v e"
print_start_time && echo "# running: phyml -i $infile -d nt -m $mat -f m -u ${infile}_JC-NJ.nwk -v e -a e -o lr"
((phymlyr >= 2022)) && phyml -i "$infile" -d nt -m "$mat" -f m -u "${infile}"_JC-NJ.nwk -v e -a e -c 4 -o lr --leave_duplicates --no_memory_check &> /dev/null
((phymlyr < 2022)) && phyml -i "$infile" -d nt -m "$mat" -f m -u "${infile}"_JC-NJ.nwk -v e -a e -c 4 -o lr --no_memory_check &> /dev/null
extra_params=2 #19 from AA frequencies + 1 gamma
total_params=$((no_branches + extra_params + ${model_free_params[$mat]}))
sites_by_K=$(echo 'scale=2;'"$no_sites/$total_params" | bc -l)
score=$(awk '/Log-l/{print $NF}' "${infile}"_phyml_stats.txt)
AICi=$(compute_AICi "$score" "$total_params")
AICc=$(compute_AICc "$score" "$total_params" "$no_sites" "$AICi")
BICi=$(compute_BIC "$score" "$total_params" "$no_sites")
printf -v model_string "%d\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f" "$total_params" "$sites_by_K" "$score" "$AICi" "$AICc" "$BICi"
model_scores["${mat}+I+G"]="$model_string"
model_cmds["${mat}+I+G"]="$mat -v e -c 4 -a e"
if ((model_set == 2)); then
if [[ $mat == 'HKY85' ]] && [[ -n ${model_cmds["${mat}+I+G"]} ]] && [[ ${seen["${mat}+G"]} -eq 1 ]]; then
HKY85IG_BICi="$AICi"
echo "HKY85+I+G_BICi: $HKY85IG_BICi"
fi
fi
done
# cleanup: remove phyml output files from the last pass through the loop
[[ -s "${infile}"_phyml_stats.txt ]] && rm "${infile}"_phyml_stats.txt
[[ -s "${infile}"_phyml_tree.txt ]] && rm "${infile}"_phyml_tree.txt
##### 5.2 if extended set, then compute and set the compositional_heterogeneity flag
# and loop over corresponding set of extended models
# check_compositional_heterogeneity and set compositional_heterogeneity flag (1|0), accordingly
if ((model_set == 2)); then
if [[ -n "$JC_BICi" ]] && [[ -n "$F81_BICi" ]]; then
compositional_heterogeneity=$(check_compositional_heterogeneity "$JC_BICi" "$F81_BICi")
echo '--------------------------------------------------------------------------------'
print_start_time
echo '# Starting evaluation and automatic selection of extended model set'
echo "# setting compositional_heterogeneity flag to: $compositional_heterogeneity"
fi
if [[ -n "$HKY85_BICi" ]] && [[ -n "$TN93_BICi" ]]; then
transitional_heterogeneity=$(check_transitional_heterogeneity "$HKY85_BICi" "$TN93_BICi")