| line | bran | sub | pod | code | 
| 1 |  |  |  | package Sanger::CGP::AlleleCount::Genotype; | 
| 2 |  |  |  |  | 
| 3 |  |  |  | ##########LICENCE########## | 
| 4 |  |  |  | # Copyright (c) 2014-2018 Genome Research Ltd. | 
| 5 |  |  |  | # | 
| 6 |  |  |  | # Author: CASM/Cancer IT | 
| 7 |  |  |  | # | 
| 8 |  |  |  | # This file is part of alleleCount. | 
| 9 |  |  |  | # | 
| 10 |  |  |  | # alleleCount is free software: you can redistribute it and/or modify it under | 
| 11 |  |  |  | # the terms of the GNU Affero General Public License as published by the Free | 
| 12 |  |  |  | # Software Foundation; either version 3 of the License, or (at your option) any | 
| 13 |  |  |  | # later version. | 
| 14 |  |  |  | # | 
| 15 |  |  |  | # This program is distributed in the hope that it will be useful, but WITHOUT | 
| 16 |  |  |  | # ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS | 
| 17 |  |  |  | # FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more | 
| 18 |  |  |  | # details. | 
| 19 |  |  |  | # | 
| 20 |  |  |  | # You should have received a copy of the GNU Affero General Public License | 
| 21 |  |  |  | # along with this program. If not, see . | 
| 22 |  |  |  | ##########LICENCE########## | 
| 23 |  |  |  |  | 
| 24 |  | 2 |  | use strict; | 
| 25 |  |  |  |  | 
| 26 |  | 2 |  | use Carp; | 
| 27 |  | 2 |  | use English qw( -no_match_vars ); | 
| 28 |  | 2 |  | use warnings FATAL => 'all'; | 
| 29 |  |  |  |  | 
| 30 |  | 2 |  | use FindBin qw($Bin); | 
| 31 |  | 2 |  | use File::Which qw(which); | 
| 32 |  | 2 |  | use File::Temp qw(tempdir); | 
| 33 |  |  |  |  | 
| 34 |  | 2 |  | use Sanger::CGP::AlleleCount; | 
| 35 |  |  |  |  | 
| 36 |  | 2 |  | use Const::Fast qw(const); | 
| 37 |  |  |  |  | 
| 38 |  |  |  | const my $MIN_MAPQ => 35; | 
| 39 |  |  |  | const my $MIN_PBQ => 30; | 
| 40 |  |  |  | const my $FLAG_REQ => 2; | 
| 41 |  |  |  | const my $FLAG_FILT => 4+8+256+512+1024+2048; # 3852 | 
| 42 |  |  |  |  | 
| 43 |  |  |  | =item new | 
| 44 |  |  |  |  | 
| 45 |  |  |  | Null constructor | 
| 46 |  |  |  |  | 
| 47 |  |  |  | =cut | 
| 48 |  |  |  |  | 
| 49 |  |  |  | sub new { | 
| 50 |  | 3 | 1 | my ($class) = @_; | 
| 51 |  |  |  | my $self = { }; | 
| 52 |  |  |  | bless $self, $class; | 
| 53 |  |  |  | return $self; | 
| 54 |  |  |  | } | 
| 55 |  |  |  |  | 
| 56 |  |  |  | =item configure | 
| 57 |  |  |  |  | 
| 58 |  |  |  | Set up the object for the current analysis. | 
| 59 |  |  |  |  | 
| 60 |  |  |  | $genotype->configure('my.bam', $min_pbq, $min_mapq [, $fasta]) | 
| 61 |  |  |  |  | 
| 62 |  |  |  | =cut | 
| 63 |  |  |  |  | 
| 64 |  |  |  | sub configure { | 
| 65 |  | 12 | 1 | my ($self, $bam_file, $min_pbq, $min_mapq, $fasta) = @_; | 
| 66 | 50 |  |  | $self->{'$fasta'} = $fasta if(defined $fasta); | 
| 67 |  |  |  | $self->{'min_pbq'} = $min_pbq // $MIN_PBQ; | 
| 68 |  |  |  | $self->{'min_mapq'} = $min_mapq // $MIN_MAPQ; | 
| 69 |  |  |  | } | 
| 70 |  |  |  |  | 
| 71 |  |  |  | =item _wrap_c_alleleCounter | 
| 72 |  |  |  |  | 
| 73 |  |  |  | Generic function to generate allele counts from a chr\tpos\n formatted file (1-based) | 
| 74 |  |  |  | by calling the C version. | 
| 75 |  |  |  |  | 
| 76 |  |  |  | Calling function should provided an intermediate path for the output if data needs reformatting for | 
| 77 |  |  |  | SNP6 style loci input files. | 
| 78 |  |  |  |  | 
| 79 |  |  |  | =cut | 
| 80 |  |  |  |  | 
| 81 |  |  |  | sub _wrap_c_alleleCounter { | 
| 82 |  | 12 |  | my ($self, $hts_file, $out_file, $clean_loci) = @_; | 
| 83 |  |  |  | my $command = sprintf _alleleCounter_c(). | 
| 84 |  |  |  | ' --loci-file=%s'. | 
| 85 |  |  |  | ' --hts-file=%s'. | 
| 86 |  |  |  | ' --output-file=%s'. | 
| 87 |  |  |  | ' --min-base-qual=%d'. | 
| 88 |  |  |  | ' --min-map-qual=%d'. | 
| 89 |  |  |  | ' --required-flag=%d'. | 
| 90 |  |  |  | ' --filtered-flag=%d', | 
| 91 |  |  |  | ($clean_loci, $hts_file, $out_file, | 
| 92 |  |  |  | $self->{'min_pbq'}, $self->{'min_mapq'}, | 
| 93 |  |  |  | $FLAG_REQ, $FLAG_FILT); | 
| 94 | 50 |  |  | if(defined $self->{'$fasta'}) { | 
| 95 |  |  |  | $command .= ' --ref-file='.$self->{'$fasta'}; | 
| 96 |  |  |  | } | 
| 97 | 50 |  |  | if($ENV{ALLELE_C_SILENT}) { #  only used for test harness | 
| 98 |  |  |  | $command .= ' 2> /dev/null' | 
| 99 |  |  |  | } | 
| 100 | 50 |  |  | system($command) && die $!; | 
| 101 |  |  |  | return; | 
| 102 |  |  |  | } | 
| 103 |  |  |  |  | 
| 104 |  |  |  | =item get_full_snp6_profile | 
| 105 |  |  |  |  | 
| 106 |  |  |  | Writes tab seperated allelic counts and depth to specified FH | 
| 107 |  |  |  | Uses all snps defined in file used by ngs_cn (format slightly different) | 
| 108 |  |  |  |  | 
| 109 |  |  |  | =cut | 
| 110 |  |  |  | sub get_full_snp6_profile { | 
| 111 |  | 6 | 1 | my ($self, $bam_file, $out_file, $loci_file, $min_pbq, $min_mapq, $fasta) = @_; | 
| 112 |  |  |  | $self->configure($bam_file, $min_pbq, $min_mapq, $fasta); | 
| 113 |  |  |  |  | 
| 114 |  |  |  | my %stored; | 
| 115 |  |  |  | # process the original loci file | 
| 116 |  |  |  | my $tmpdir = tempdir( CLEANUP => 1 ); | 
| 117 |  |  |  | my $tmp_loci = $tmpdir.'/loci_tmp.out'; | 
| 118 |  |  |  | my $tmp_out = $tmpdir.'/out_tmp.out'; | 
| 119 | 50 |  |  | open my $N_LOC, '>', $tmp_loci or croak "Unable to open $tmp_loci for writing: $OS_ERROR\n"; | 
| 120 | 50 |  |  | open my $SNP6, '<', $loci_file or croak "Unable to open $loci_file for reading: $OS_ERROR\n"; | 
| 121 |  |  |  | while(my $line = <$SNP6>) { | 
| 122 |  |  |  | chomp $line; | 
| 123 |  |  |  | my ($chr, $pos, undef, undef, $allA, $allB) = split /\s/, $line; | 
| 124 |  |  |  | $stored{"$chr:$pos:A"} = uc $allA; | 
| 125 |  |  |  | $stored{"$chr:$pos:B"} = uc $allB; | 
| 126 |  |  |  | printf $N_LOC "%s\t%d\n", $chr, $pos; | 
| 127 |  |  |  | } | 
| 128 |  |  |  | close $N_LOC; | 
| 129 |  |  |  | close $SNP6; | 
| 130 |  |  |  |  | 
| 131 |  |  |  | _wrap_c_alleleCounter($self, $bam_file, $tmp_out, $tmp_loci); | 
| 132 |  |  |  |  | 
| 133 | 50 |  |  | open my $cfh, '<', $tmp_out or croak "Unable to open $loci_file for reading: $OS_ERROR\n"; | 
| 134 | 50 |  |  | open my $ofh, '>', $out_file or croak "Unable to open $out_file for writing: $OS_ERROR\n"; | 
| 135 |  |  |  | # header | 
| 136 | 50 |  |  | print $ofh "#CHR\tPOS\tCount_Allele_A\tCount_Allele_B\tGood_depth\n" or croak "Failed to write line: $OS_ERROR\n"; | 
| 137 |  |  |  |  | 
| 138 |  |  |  | while(my $line = <$cfh>) { | 
| 139 | 100 |  |  | next if($line =~ m/^#/); | 
| 140 |  |  |  | chomp $line; | 
| 141 |  |  |  | my ($chr, $pos, $c_a, $c_c, $c_g, $c_t, $depth) = split /\t/, $line; | 
| 142 |  |  |  | my %tmp = ('A', $c_a, 'C', $c_c, 'G', $c_g, 'T', $c_t); | 
| 143 |  |  |  | printf $ofh "%s\t%d\t%d\t%d\t%d\n", $chr, | 
| 144 |  |  |  | $pos, | 
| 145 |  |  |  | $tmp{$stored{"$chr:$pos:A"}}, | 
| 146 |  |  |  | $tmp{$stored{"$chr:$pos:B"}}, | 
| 147 |  |  |  | $depth; | 
| 148 |  |  |  | } | 
| 149 |  |  |  | close $cfh; | 
| 150 |  |  |  | close $ofh; | 
| 151 |  |  |  | return 1; | 
| 152 |  |  |  | } | 
| 153 |  |  |  |  | 
| 154 |  |  |  | =item get_full_loci_profile | 
| 155 |  |  |  |  | 
| 156 |  |  |  | Writes tab seperated allelic counts and depth to specified FH | 
| 157 |  |  |  | Uses all loci defined in specified file | 
| 158 |  |  |  |  | 
| 159 |  |  |  | =cut | 
| 160 |  |  |  | sub get_full_loci_profile { | 
| 161 |  | 6 | 1 | my ($self, $bam_file, $out_file, $loci_file, $min_pbq, $min_mapq, $fasta) = @_; | 
| 162 |  |  |  | $self->configure($bam_file, $min_pbq, $min_mapq, $fasta); | 
| 163 |  |  |  | _wrap_c_alleleCounter($self, $bam_file, $out_file, $loci_file); | 
| 164 |  |  |  | return 1; | 
| 165 |  |  |  | } | 
| 166 |  |  |  |  | 
| 167 |  |  |  | =item gender_chk | 
| 168 |  |  |  |  | 
| 169 |  |  |  | Writes the chromosome name for the Male sex chromosome as defined by loci file and 'Y/N' | 
| 170 |  |  |  | indicating presence of any of the SNPs.  E.g. | 
| 171 |  |  |  |  | 
| 172 |  |  |  | chrX  Y | 
| 173 |  |  |  |  | 
| 174 |  |  |  | or | 
| 175 |  |  |  |  | 
| 176 |  |  |  | X  N | 
| 177 |  |  |  |  | 
| 178 |  |  |  | =cut | 
| 179 |  |  |  | sub gender_chk { | 
| 180 |  | 0 | 1 | my ($self, $bam_file, $out_file, $loci_file, $min_pbq, $min_mapq, $fasta) = @_; | 
| 181 |  |  |  | $self->configure($bam_file, $min_pbq, $min_mapq, $fasta); | 
| 182 |  |  |  | my $tmpdir = tempdir( CLEANUP => 1 ); | 
| 183 |  |  |  | my $tmp_out = $tmpdir.'/gender_chk.out'; | 
| 184 |  |  |  | _wrap_c_alleleCounter($self, $bam_file, $tmp_out, $loci_file); | 
| 185 |  |  |  |  | 
| 186 |  |  |  | my $sex_chr; | 
| 187 |  |  |  | my $is_male = 'N'; | 
| 188 | 0 |  |  | open my $fh, '<', $tmp_out or croak 'Unable to open '.$tmp_out.' for reading'; | 
| 189 |  |  |  | while(my $line = <$fh>) { | 
| 190 | 0 |  |  | next if($line =~ /^#/); | 
| 191 |  |  |  | chomp $line; | 
| 192 |  |  |  | my ($chr, $pos, $depth) = (split /\t/, $line)[0,1,-1]; | 
| 193 | 0 |  |  | if(defined $sex_chr) { | 
| 194 | 0 |  |  | die "Only loci expected on the 'male' sex chromosome should be included in: $loci_file\n\tYou have $sex_chr & $chr so far!\n" if($chr ne $sex_chr); | 
| 195 |  |  |  | } | 
| 196 |  |  |  | else { | 
| 197 |  |  |  | $sex_chr = $chr; | 
| 198 |  |  |  | } | 
| 199 | 0 |  |  | if($depth > 5) { | 
| 200 |  |  |  | $is_male = 'Y'; | 
| 201 |  |  |  | # technically we could stop here, but we should check all the chrs to make sure this isn't the wrong LOCI file | 
| 202 |  |  |  | } | 
| 203 |  |  |  | } | 
| 204 |  |  |  | close $fh; | 
| 205 |  |  |  |  | 
| 206 | 0 |  |  | open my $ofh, '>', $out_file or croak 'Unable to open '.$out_file.' for writing'; | 
| 207 |  |  |  | printf $ofh "%s\t%s\n", $sex_chr, $is_male; | 
| 208 |  |  |  | close $ofh; | 
| 209 |  |  |  | return; | 
| 210 |  |  |  | } | 
| 211 |  |  |  |  | 
| 212 |  |  |  | sub _alleleCounter_c { | 
| 213 |  | 12 |  | my $l_bin = $Bin.'/../../c/bin'; | 
| 214 |  |  |  | my $prog = 'alleleCounter'; | 
| 215 |  |  |  | my $path = File::Spec->catfile($l_bin, $prog); | 
| 216 | 50 |  |  | $path = which($prog) unless(-e $path); | 
| 217 | 50 |  |  | die "Failed to find alleleCounter in path or local bin folder ($l_bin)\n\tPATH: $ENV{PATH}\n" unless(defined $path && -e $path); | 
| 218 |  |  |  | return $path; | 
| 219 |  |  |  | } | 
| 220 |  |  |  |  | 
| 221 |  |  |  | 1; |