#!/usr/local/bin/perl5 -w # target98-with-param # builds an alignment of probable homologs of an initial seed sequence # or alignment, with homologs chosen from NRP. # # This script is a parameterized version of the target98 script # It behaves similarly to the old build-target98 script when # default parameters are used, but is not quite plug-compatible # ("build-target98 a b.a2m" is equivalent to # "target98-with-param -seed a -out b"). # # This script takes one required and several optional arguments: # REQUIRED: # -seed file.a2m a guide sequence (or alignment) # -out root creates root.a2m for output # OPTIONAL: # -iter 4 number of iterations # -all creates root_1.a2m, root_2.a2m, ... # for the multiple iterations # -thresholds -40,-30,-24,-16 specifies the NLL-null # thresholds to use for each iteration # -reverse_diff 7 how much looser is the # reversed-sequence threshold # -viterbi_diff 4 how much looser is the Viterbi threshold # -blast_e 500 how far out are remote homologs looked for # -blast_close -10.5 LN(P_value) for close homologs # -build varh50 how are search models built from alignments # -weight_build if selected use weights (currently Henikoff # weights at 0.5 bits/column) for buildmodel. # -weight_dist if selected, then use weights proportional # to log P(s|M)/P(s|null) for training. # -homolog file containing homologs of the guide # sequence. If this parameter is not specified # the default action is to extract close homologs # from NRP using blast. # -close file containing close homologs of the guide # sequence. These close homologs are used to # train the model on the first training # iteration. If this parameter is not specified, # but -homolog is specified, then the file from # the -homolog option is used for the close # homologs. If neither -close nor -homolog are # specified, then the default is obtain both # from NRP using blast, but -close uses a # much tighter threshold. # -no_homologs use only the seed a2m for # homologs--don't look for more. # -force_seed 1 set (default) to force all of seed # into the training set. # -jump 1.0 probability associated with jump-in and # jump-out on local alignment # -fimstrength 1.0 probability associated with loops on fims # -new_sam Look for sam in "new" directory # -seed_constraints cst constraints file associated with seed # alignment. This will cause root.cst to be # created (see -out). # -db nr Which non-redundant database to use # (understands nr and nrp, otherwise needs # full path--default now nr). # -nsurgery 0 number of surgery steps to allow on # each iteration (default 0) # -mainline_cutoff 0.5 mainline_cutoff in buildmodel for surgery # NOT IMPLEMENTED YET: # -trans file Specify transition regularizer # # WARNING: the defaults may change---if they do, a comment will be added # explaining what parameter setting comes closest to the original # script. # # DIFFERENCES FROM ORIGINAL: # 1) No noise in training with buildmodel--currently no parameter # setting will give the original behavior. This should be a minor # change that doesn't affect behavior much. # (Oops, noise changed ot ALMOST zero, since stopcriterion bug # if zero noise.) # 2) -insconf increased to 30000 in buildmodel, since some # alignments quite large. Should have minor effect, so no parameter # setting to get original behavior. # 3) eliminated multdomain, using hmmscore instead---this may affect # exactly which sequences are aligned in the $base-b alignment, but # again, the effect is probably small, and so no way to get # original behavior is provided. # 4) Changed motif_cutoff from 0.4 to min(40/length, 0.6). This is # expected to get more homologs on long proteins, and fewer on # short ones. No way to get original behavior. # 5) Added -fimstrength -jump_in_prob and -jump_out_prob to reduce # over-eager FIMS. This may have significant effect, and # needs to be tested! # # WARNING: This script is designed to use new features of SAM version 2.1 # (Feb 1998). It will NOT work with earlier versions, though they may # not give error messages. # use FileHandle; STDOUT->autoflush(1); STDERR->autoflush(1); $prog=$0; # Set up defaults # thresholds for NLL-null @thresholds = (-40, -30, -24, -16); # adjustments to threshold for scoring against reversed sequence # and for scoring with Viterbi path (both of which give more # positive costs). $reverse_diff = 7; $viterbi_diff = 4; $blast_e = 500; $blast_close = -10.5; $build = "varh50"; $jump = 1.0; $fimstrength= 1.0; $save_all = 0; $num_iter=4; $HOMO = ""; $CLOSE = ""; $use_constraints = 0; $no_homologs = 0; $force_seed = 1; $new_sam=0; $weighted_build = 0; $weight_by_distance = 0; # which non-redundant protein database to search $NRP = "/projects/compbio/data/nrp/NRP_ALL.PDB"; $NR = "/projects/compbio/data/nrp/nr"; $which_nrp = $NR; $nsurgery=0; $mainline_cutoff=0.5; &process_command_line; # Make sure that search path includes the compbio bin directories # Has to be done AFTER process_command_line, since use_constraints # needs to be set correctly to determine the right hmmscore. &set_paths; # Do the rm on alpha (which owns the scratch directory) if needed # to avoid problems with ultrasparc/alpha incompatibility. $rsh_alpha = ($arch eq "alpha")? "": "rsh alpha "; $WORKDIR = "/projects/compbio4/experiments/workspace/target98-$$"; make_directories($WORKDIR); $BASE_A2M = ($seed_a2m =~ /(\S+)[.]gz$/ )? $1 : $seed_a2m; chdir $WORKDIR; if (-e $BASE_A2M) { system "cp $BASE_A2M init.a2m"; } elsif (-e "$BASE_A2M.gz") { system "zcat $BASE_A2M.gz > init.a2m"; } else { die "$prog error: can't find $BASE_A2M or $BASE_A2M.gz\n"; } if ($use_constraints) { $BASE_CST = ($seed_constraints =~ /(\S+)[.]gz$/ )? $1 : $seed_constraints; if (-e $BASE_CST) { system "cp $BASE_CST init.cst"; } elsif (-e "$BASE_CST.gz") { system "zcat $BASE_CST.gz > init.cst"; } else { die "$prog error: can't find $BASE_CST or $BASE_CST.gz\n"; } } system("checkseq foo -db init.a2m > init.check")==0 || die "$prog error: Checkseq failed on initial alignment\n"; open(CHECK, ") { if (/^Alignment:\s+no/) { die "$prog error: $BASE_A2M not recognized as an alignment\n"; } if (/^AlignColumns:\s+(\d+)/) { $length = $1; if ($length <= 1) { die "$prog error: $BASE_A2M has $length alignment columns\n" . "A2M format requires upper-case for aligned residues\n"; } } }; close CHECK; if ($HOMO eq "" && ! $no_homologs) { $blast_dir = "blast-$$"; make_directories($blast_dir); chdir $blast_dir; # Should redo build_query files to build one file, do query, # accumulate results, and continue system("uniqueseq init -db ../init.a2m -a protein")==0 || die "uniqueseq of initial alignment failed\n"; build_query_files("init.seq"); foreach $query (@queries) { $blast_out = "$query-blast.out"; blast_query_to_database("wu-blastp", $query, $which_nrp, $blast_out); accumulate_blast_hits($blast_out); unlink $query; unlink $blast_out; } chdir ".."; system("$rsh_alpha rm -rf $blast_dir < /dev/null")==0 || die "$prog error: $rsh_alpha failed\n"; # HERE WE EXTRACT homolog sequences fron $which_nrp and put them # into a file called $HOMO $HOMO = "found-by-wu-blast"; $close_from_blast = ($CLOSE eq ""); $CLOSE = "close-homologs-by-blast" if $close_from_blast; open(NRP_IN, "< $which_nrp") || die "Can't read $which_nrp\n"; open(HOMO_FILE, ">$HOMO") || die "Can't open $HOMO for writing\n"; if ($close_from_blast) { open(CLOSE_HOMO, ">$CLOSE") || die "Can't open $CLOSE_HOMO for writing\n"; } $copying=0; $copying_close= 0; while() { if (/^>\s*(\S+)[, ]?/) { # new id $copying = defined $blast_hits{$1}; $copying_close = $copying && ($blast_hits{$1} <= $blast_close); if ($copying) { # print STDERR "copying $1\n"; $blast_hits{$1} = "copied"; } } print HOMO_FILE $_ if $copying; print CLOSE_HOMO $_ if $copying_close && $close_from_blast; } close HOMO_FILE; close CLOSE_HOMO if $close_from_blast; foreach $missing (keys(%blast_hits)) { if ($blast_hits{$missing} ne "copied") { print STDERR "Error: $prog couldn't find $missing in $which_nrp\n"; } } } $LIB = "/projects/compbio/lib"; $MIXTURE = "$LIB/recode2.20comp"; $BL29 = "$LIB/bl29.regularizer"; $LONG = "$LIB/long_match.regularizer"; $OPT = "$LIB/fssp-trained.regularizer"; $BUILDPARAM= "-prior_library $MIXTURE \\\ -modellength 0 \\\ -internal_weight 0 \\\ -initial_noise 0.0001 -anneal_noise 0.000001 \\\ -anneal_length 2 \\\ -many_files 1 \\\ -insconf 30000 -Nseq 20000\\\ -binary_output 1 \\\ -nsurgery $nsurgery -mainline_cutoff $mainline_cutoff \\\ -Nmodels 1 \\\ -fimstrength $fimstrength \\\ -sw 2 -jump_in_prob $jump -jump_out_prob $jump"; # CHANGE: 14 Feb 1998, removed noise from training # -initial_noise 0.5 -anneal_noise 0.01 \\\ # -anneal_length 5 \\\ # NOTE: as of 5 Dec 1997, buildmodel ignores -sw 2. # But now it uses it (July 1998) $motif_cutoff = 40 / $length; # normally require 40 residues in homolog $motif_cutoff = 0.6 if $motif_cutoff >0.6; $HMMSCOREPARAM="-sw 2 -viterbi 0 -subtract_null 4 -simplethreshold 1 \\\ -select_seq 3 -select_score 8 -sort 2 \\\ -select_mdalign 3 -Motifcutoff $motif_cutoff \\\ -fimstrength $fimstrength \\\ -jump_in_prob $jump -jump_out_prob $jump"; # Build initial model using only the input sequence # This model is not as good as I'd like---I need to develop a better # model-building method for use with single sequences. # select from the CLOSE homologs training set, # threshold set VERY conservatively for no false positive on chothia-test system("cp init.a2m m0.a2m"); if ($use_constraints) { system("cp init.cst m0.cst"); } $name = "m0"; for ($iter=1; $iter<=$num_iter; $iter++) { $possible = ($iter==1? $CLOSE: $HOMO); # Note: it would probably be best to find one transition regularizer # and use it consitently $reg= ($iter==$num_iter? $OPT: ($iter==1? $BL29: $LONG)); $prev=$name; $name = ($save_all? $root : "m") ."_" . $iter; &select_and_train("$prev.a2m", "$prev.cst", "$name.a2m", "$name.cst", "tmp_$iter", $possible, $thresholds[$iter-1], $reg); } # Have to do a copy rather than a rename for the final alignment, # because it may be across different file systems. if (! $save_all) { $dest_a2m = $root . ".a2m"; system ("cp $name.a2m $dest_a2m") ==0 || die "Copying final alignment to $dest_a2m failed.\n"; $dest_cst = $root . ".cst"; if ($use_constraints) { system ("cp $name.cst $dest_cst") ==0 || die "Copying final constraints to $dest_cst failed.\n"; } } system "$rsh_alpha rm -rf $WORKDIR < /dev/null"; exit 0; #-------------------------------------------------- # subroutines # parse arguments sub process_command_line { local($i) = 0; while ($i <= $#ARGV) { $_ = $ARGV[$i++]; if (/^-seed$/) { $seed_a2m = absolute( $ARGV[$i++]); (-e $seed_a2m) || die "$seed_a2m doesn't exist.\n"; } elsif (/^-out/) { $root = absolute($ARGV[$i++]); } elsif (/^-iter/){ $num_iter = $ARGV[$i++]; } elsif (/^-jump/){ $jump = $ARGV[$i++]; } elsif (/^-fimstrength/) { $fimstrength = $ARGV[$i++]; } elsif (/^-all/) { $save_all = 1; } elsif (/^-build/) { $build = $ARGV[$i++]; } elsif (/^-blast_e/) { $blast_e = $ARGV[$i++]; } elsif (/^-blast_close/) { $blast_close = $ARGV[$i++]; } elsif (/^-reverse_diff/) { $reverse_diff = $ARGV[$i++]; } elsif (/^-viterbi_diff/) { $viterbi_diff = $ARGV[$i++]; } elsif (/^-thresholds/) { $threshold_spec= $ARGV[$i++]; @thresholds = split(/,/ , $threshold_spec); } elsif (/^-weight_dist/) { $weight_by_distance = 1; $weighted_build = 1; } elsif (/^-weight_build/){ $weighted_build = 1; } elsif (/^-homolog/) { $HOMO = absolute($ARGV[$i++]); (-e $HOMO) || die "$HOMO doesn't exist.\n"; } elsif (/^-close/) { $CLOSE = absolute($ARGV[$i++]); (-e $CLOSE) || die "$CLOSE doesn't exist.\n"; } elsif (/^-no_homologs/) { $no_homologs = 1; } elsif (/^-force_seed/) { $force_seed = $ARGV[$i++]; } elsif (/^-nsurgery/) { $nsurgery = $ARGV[$i++]; } elsif (/^-mainline_cutoff/){ $mainline_cutoff = $ARGV[$i++]; } elsif (/^-new_sam/) { $new_sam=1; } elsif (/^-seed_constraints/) { $seed_constraints = absolute($ARGV[$i++]); (-e $seed_constraints) || die "$seed_constraints doesn't exist.\n"; $use_constraints = 1; } elsif (/^-db/) { $which_nrp = $ARGV[$i++]; if ($which_nrp eq "nrp") {$which_nrp=$NRP;} elsif ($which_nrp eq "nr") {$which_nrp=$NR;} else {$which_nrp = absolute($which_nrp);} } else { die "Parameter '$ARGV[$i-1]' not understood by $prog\n"; } } # Make certain that required parameters were provided. defined($seed_a2m) || die "Must specify '-seed file.a2m' with $prog\n"; defined($root) || die "Must specify '-out root' with $prog\n"; if ($HOMO eq "") { (-e $which_nrp) || die " Can't find database file $which_nrp\n"; } # 10000 is maximum permissible value for E $blast_e = ($blast_e> 10000)? 10000 : $blast_e; # verify that $num_iter>0 if ($num_iter <=0) { print STDERR "Warning: -iter specifed with number <=0, using 1\n"; $num_iter = 1; } # Pad @thresholds if it doesn't have the right number of thresholds $num_thresh = $#thresholds +1; if ($num_thresh <$num_iter) { print STDERR "Warning: # thresholds ($num_thresh) < # iterations ($num_iter)\n"; } for ($i=0; $i<=$#thresholds; $i++) { if ($thresholds[$i] >0 ) { print STDERR "$prog: Thresholds must be <=0, not $thresholds[$i]\n"; @thresholds= @thresholds[0..$i-1]; } } if ($#thresholds==-1) { push @thresholds, -40; } while ($#thresholds < $num_iter-1) { push @thresholds, ($thresholds[$#thresholds] * 0.75); } # Echo the parameters to stderr print STDERR "$prog: Thresholds used are $thresholds[0]"; for ($i=1; $i<$num_iter; $i++) { print STDERR ", $thresholds[$i]"; } print STDERR "\n"; print STDERR "$prog: reverse_diff= $reverse_diff, viterbi_diff= $viterbi_diff\n"; print STDERR "$prog: build= $build\n"; if ($HOMO eq "") { print STDERR "$prog: blast_e= $blast_e, blast_close= $blast_close\n"; } else { $CLOSE = $HOMO if ($CLOSE eq ""); print STDERR "$prog: homologs=$HOMO close=$CLOSE\n"; } return; } # run through uniqueseq to remove exact duplicates sub remove_dup_seq($) { my ($file) = @_; system("uniqueseq foo -db $file -a protein")==0 || die "uniqueseq failed\n"; system("mv foo.seq $file")==0 || die "mv failed\n"; } # # accumulate_blast_hits(blast_output) # reads the blast_output file and collects ids of hits # adding them to hash %blast_hits # sub accumulate_blast_hits($) { my($blast_out_file) = @_; open(BLASTOUT_IN, "<$blast_out_file") || die "Error: can't open file $blast_out_file\n"; # # seek Query line # my($blastout_in_line); my($query_id); $blastout_in_line = ; while ($blastout_in_line and $blastout_in_line !~ /^Query=/) { $blastout_in_line = ; } if ($blastout_in_line and $blastout_in_line =~ /^Query=\s+(\S+)/) { $query_id = $1; } else { print STDERR "Error: file $blast_out_file has no Query line\n"; return; } # # seek to summary of hits to query # my($hits_table_hdr); $hits_table_hdr = "Sequences producing High-scoring Segment Pairs:"; $blastout_in_line = ; while ($blastout_in_line and $blastout_in_line !~ /^$hits_table_hdr/) { $blastout_in_line = ; } # # skip header and the blank line following it # my($hits_table_line_pat); $hits_table_line_pat = '^(\S+)\s+.*(\d+)\s+(\S+)\s+(\d+)'; # $1 $2 $3 $4 # id comment score P(N) N $blastout_in_line = ; $blastout_in_line = ; while ($blastout_in_line and $blastout_in_line =~ /$hits_table_line_pat/) { my($hitseq_id, $high_score, $pvalue, $nfrag, $log_p); $hitseq_id = $1; $pvalue = $3; $log_p = $pvalue<=0? -1300: log($pvalue); $blast_hits{$hitseq_id} = $log_p; # print STDERR "$hitseq_id P(N)=$pvalue => $log_p\n"; $blastout_in_line = ; } } # # sub blast_query_to_database($blast_prog, # $query_file, $database_file, # $blast_out_file); # sub blast_query_to_database($$$$) { my($blast_prog, $query_file, $database_file, $blast_out_file) = @_; # report up to 10000 sequences, but don't report any alignments. $blast_params = " E=$blast_e V=10000 B=0 -gi"; my($blast_cmd) = "$blast_prog $database_file $query_file" . " $blast_params > $blast_out_file"; print STDERR "\n$blast_cmd\n"; system($blast_cmd)==0 || die "$blast_cmd failed\n"; } # # sub build_query_files(database_file) # builds query files named 1.fasta, 2.fasta, ... # leaves names of files in @queries # sub build_query_files($$) { my($database_file) = @_; open(DB_IN, "<$database_file") || die "Error: can't open file $database_file\n"; my $query_count = 0; my($db_in_line); my($new_seq_pat); $new_seq_pat = '^>(\S+)(.*)'; $db_in_line = ; while ($db_in_line) { my($seq_id, $seq_info, $seq); ($db_in_line =~ /$new_seq_pat/o) || die "Error: bad line in file $database_file\n"; $seq_id = $1; $seq_info = $2; $db_in_line = ; $seq = ""; while ($db_in_line and $db_in_line !~ /$new_seq_pat/o) { $seq .= $db_in_line; chomp($seq); $db_in_line = ; } $query_count ++; my($query_out_file) = $query_count . ".fasta"; push @queries, $query_out_file; open(QUERY_OUT, ">$query_out_file") || die "Error: can't open file $query_out_file"; print QUERY_OUT ">$seq_id\n"; print QUERY_OUT "$seq\n"; close (QUERY_OUT); } close (DB_IN); } # prepend relative path names with current working directory # usage $absolute_file = absolute($file) sub absolute($) { my ($file) = @_; return $file if ($file =~ /^\/.*/); my $wd = `pwd`; chop($wd); if ($wd=~/\/auto(.*)/) {return "$1/$file"}; return "$wd/$file"; } # make all directories along path to specified directory sub make_directories ($) { my ($dir) = @_; @pieces = split /\//, $dir; for (my $i = 0; $i <=$#pieces; $i++) { my $path = join ('/', @pieces[0 .. $i]); next if $path eq ""; if (! -d $path) { mkdir $path, 0775; chmod 0775, $path; if (`groups` =~ /protein/ ) { system("chgrp protein $path")==0 || die "$prog:chgrp failed\n"; } } } } # Do one round of selection and training, starting and ending with an alignment # name of initial alignment # name of initial alignment's constraints file # name of final alignment # name of final alignment's constraints file # base name for temporaries # set of possible homologs to search # threshold to use # transition regularizer to use sub select_and_train($$$$$$$$) { my ($align_in, $constraints_in, $align_out, $constraints_out, $base, $homologs, $thresh, $reg) = @_; $reverse_thresh = $thresh + $reverse_diff; $vit_thresh = $thresh + $viterbi_diff; # build inital model $cmd = "$build $align_in $base-a.mod"; if ($use_constraints) { $cmd .= " $constraints_in"; } system($cmd)==0 || die "$build failed\n"; # select from possible homologs (using simple null and reverse sequence) # then reselect only the relevant parts # (but add some extra residues for context) $cmd = "hmmscore $base-a -i $base-a.mod $HMMSCOREPARAM \\\ -db init.a2m \\\ -NLL_NULL $thresh -NLLcomplex $reverse_thresh \\\ -align_short 50 -mdNLLNULL $vit_thresh"; if (!$no_homologs) { $cmd .= " -db $homologs"; } if ($use_constraints) { $cmd .= " -constraints $constraints_in"; $cmd .= " -constraints_out $base-a.cst"; } system($cmd) == 0 || die "hmmscore $base-a failed\n"; # $base-a.sel has the selected homologs # $base-a.mult has parts of the selected homologs to train on # $base-a.cst has the constraints for both the .mult sequences, plus # original links. # add the initial alignment to the training set, to make sure it doesn't # get lost along the way if ($force_seed) { system "cat init.a2m $base-a.mult > $base-a.train"; } else { rename ("$base-a.mult", "$base-a.train"); } if ($weighted_build) { if ($weight_by_distance) { system("cat $base-a.mstat $base-a.dist \\\ | dist-weights > $base-a.w")==0 || die "$prog error:dist-weights failed\n"; remove_dup_seq("$base-a.train"); } else { $weight_args = "HenikoffWeight 0.5 0.4"; # 0.5 bits/column (system ("/projects/compbio/lib/make-weights.pl \\\ $base-a.train \\\ $base-a.w $MIXTURE \"$weight_args\" 1.0 \\\ > $base-a.weight.log") ==0) || die "Error: building weight file for $base-a.train failed.\n"; } } else { # remove duplicates from the training set. # Do this only if no alignment-based weighting done, # since uniqueseq foolishly discards alignment, making weighting # impossible after uniqueseq remove_dup_seq("$base-a.train"); } $cmd = "buildmodel $base-b -insert $base-a.mod\\\ $BUILDPARAM -train $base-a.train \\\ -insert $reg" . ($weighted_build? " -sequence_weights $base-a.w": ""); if ($use_constraints) { $cmd .= " -constraints $base-a.cst" } system($cmd)==0 || die "buildmodel $base-b failed\n"; # Reselect interesting pieces out of the selection set. $cmd = "hmmscore $base-b -i $base-b.mod $HMMSCOREPARAM \\\ -db init.a2m \\\ -db $base-a.sel \\\ -NLL_NULL $thresh -NLLcomplex $reverse_thresh \\\ -align_short 5 -mdNLLNULL $vit_thresh"; if ($use_constraints) { $cmd .= " -constraints $base-a.cst \\\ -constraints_out $base-b.cst"; } system($cmd)==0|| die "hmmscore $base-b failed\n"; if ("$base-b.mult" ne $align_out) { system("cp -f $base-b.mult $align_out")==0 || die "Can't copy alignment to $align_out\n"; } if ($use_constraints && ("$base-b.cst" ne $constraints_out)) { system("cp -f $base-b.cst $constraints_out")==0 || die "Can't copy constraints to $constraints_out\n"; } if (-z $align_out) { print STDERR "Alignment $align_out empty, copying initial alignment\n"; system ("cp -f init.a2m $align_out")==0 || die "Couldn't copy init.a2m to $align_out\n"; } } sub set_paths { $scriptsdir = $ENV{SCRIPTS97} ? $ENV{SCRIPTS97} : "/projects/compbio/experiments/models.97/scripts"; $arch = `uname -m`; chomp ($arch); $ENV{PATH} = "."; $ENV{PATH} .= ":$scriptsdir"; # For now, a different version of SAM is being used for constraints. if ($use_constraints) { $ENV{PATH} = $ENV{PATH} . ":$scriptsdir/sam-cst/$arch"; } $ENV{PATH} .= ":/projects/compbio/bin"; if ($new_sam) { $ENV{PATH} .= ":/projects/compbio/bin/$arch/new"; } $ENV{PATH} .= ":/projects/compbio/bin/$arch"; $ENV{PATH} .= ":/projects/compbio/bin/hmmscripts"; $ENV{PATH} .= ":/projects/compbio/bin/scripts"; $ENV{PATH} .= ":/sbin:/usr/sbin:/bin:/usr/bin:/usr/local/gnu/bin:/usr/local/share/request/bin:/usr/local/bin"; print STDERR "PATH=$ENV{PATH}\n"; }