#!/usr/bin/perl -w
# Name: Jeremy Zucker
# email: zucker@research.dfci.harvard.edu
# Problem Set: #1
#
# Transcribes a hard-coded DNA sequence into RNA, then translates the RNA and
# its reverse complement in each of the three reading frames, printing every
# translated sequence followed by a histogram of its amino-acid counts.

# Only the 'vars' and 'refs' strictures are enabled: 'subs' is left off
# because other code in this file may still rely on bareword strings.
# TODO: switch to a plain "use strict;" once that is confirmed not to be so.
use strict qw(vars refs);
use warnings;

my $DNA_seq = "CATTACGATGCATTG
ATTTTTCAAAGGAAT
GTACTATCGAAATCA
CAAGTCGTGGACTAC
GGTTTGCAGTGGAGG
AATCGCAGTCTTTGC
AGGCTCACGCCTTTC
TTGATAAGTCGTTGT
TTCAAACGTTTAATT
TTCAGGGTGATTCAG
ATGGGGATACATATA
TGTTCCAGACGATGA
TTTCACCT";

# The literal above is laid out in chunks for readability; strip the layout
# whitespace so that only nucleotides remain.
$DNA_seq =~ s/\s+//g;
print "Cleaned up DNA sequence of length " . length($DNA_seq) . ":\n$DNA_seq\n";

my $RNA_seq = transcribe($DNA_seq);
print "\nRNA sequence: \n$RNA_seq\n";

print "\nTranslated sequence: \n";
_print_reading_frames($RNA_seq);

my $reverse_complement = reverse_compliment($RNA_seq);
print "\nReverse-complemented RNA sequence:\n $reverse_complement \n";

print "\nTranslation of reverse-complemented RNA sequence:\n";
_print_reading_frames($reverse_complement);

# Input:  An RNA sequence.
# Output: Prints, for each of the reading frames 0, 1 and 2, the translated
#         sequence followed by its amino-acid histogram to STDOUT.
sub _print_reading_frames {
    my ($rna) = @_;

    for my $frame (0 .. 2) {
        my %protein = translate($rna, $frame);
        print "\n\tReading Frame $frame:\n" . $protein{"sequence"} . "\n";
        print_histogram(%protein);
    }
    return;
}

# Input:  Hash table of amino acids (keys) and their frequencies (values).
#         The bookkeeping keys "sequence" and "" are ignored.
# Output: Prints histogram to STDOUT, one line of asterisks per amino acid,
#         in sorted key order.
sub print_histogram {
    my %protein = @_;

    print "\nHistogram\n";
    for my $amino_acid (sort keys %protein) {
        # "sequence" holds the translated string and "" counts codons that
        # could not be translated; neither is an amino acid.
        next if $amino_acid eq "sequence" || $amino_acid eq "";
        print "$amino_acid: " . ("*" x $protein{$amino_acid}) . "\n";
    }
    return;
}

# Input:  A sequence of characters [AGCU] (either case) that represents a
#         biologically functional strand of RNA.
# Output: The reverse-complemented sequence; the case of each base is kept.
sub reverse_compliment {
    my ($sequence) = @_;

    # "scalar" forces string reversal; in list context reverse() would
    # reverse the argument list instead of the characters.
    my $complement = scalar reverse $sequence;
    $complement =~ tr/AUGCaugc/UACGuacg/;
    return $complement;
}

# Input:  A sequence of characters in [AGCU] that represents the RNA
#         sequence, and the reading frame (0, 1, or 2; defaults to 0).
# Output: A hash table which contains the frequency count of each amino acid
#         in the sequence and, under the key "sequence", the full translated
#         sequence (the empty string when there is no complete codon).
#         Codons that translate_codon cannot translate are counted under the
#         key "".
sub translate {
    my ($sequence, $reading_frame) = @_;
    $sequence      = '' unless defined $sequence;
    $reading_frame = 0  unless defined $reading_frame;

    # Pre-seed "sequence" so that callers never see an undefined value.
    my %protein = (sequence => '');

    # Stop at the last position where a complete 3-base codon still fits, so
    # that a trailing fragment of one or two bases is not counted.
    my $last_start = length($sequence) - 3;
    for (my $i = $reading_frame; $i <= $last_start; $i += 3) {
        my $amino_acid = translate_codon(substr($sequence, $i, 3));
        $protein{$amino_acid}++;
        $protein{"sequence"} .= $amino_acid;
    }
    return %protein;
}

# Input:  The coding strand of the DNA sequence -- NOT the template strand
#         that the RNA polymerase II actually reads to synthesize RNA!
# Output: The transcribed RNA sequence (every T replaced by U, every t by u).
sub transcribe {
    my ($sequence) = @_;

    $sequence =~ tr/Tt/Uu/;
    return $sequence;
}

# Input: a sequence of 3 characters that represents the tri-nucleotide codon that tRNA reads
# Output: the 3 letter symbolic representation of the corresponding amino acid.
# The lookup table is built inside a BEGIN block so that it is already filled
# at compile time: top-level code earlier in the file calls translate_codon
# before execution would reach an ordinary "my %table = (...)" placed here.
BEGIN {
    # Standard genetic code, grouped by amino acid (three-letter
    # abbreviation => RNA codons).  "***" marks the stop codons.
    my %codons_for = (
        Ala   => [qw(GCA GCC GCG GCU)],          # Alanine
        Cys   => [qw(UGC UGU)],                  # Cysteine
        Asp   => [qw(GAC GAU)],                  # Aspartic acid
        Glu   => [qw(GAA GAG)],                  # Glutamic acid
        Phe   => [qw(UUC UUU)],                  # Phenylalanine
        Gly   => [qw(GGA GGC GGG GGU)],          # Glycine
        His   => [qw(CAC CAU)],                  # Histidine
        Ile   => [qw(AUA AUC AUU)],              # Isoleucine
        Lys   => [qw(AAA AAG)],                  # Lysine
        Leu   => [qw(UUA UUG CUA CUC CUG CUU)],  # Leucine
        Met   => [qw(AUG)],                      # Methionine
        Asn   => [qw(AAC AAU)],                  # Asparagine
        Pro   => [qw(CCA CCC CCG CCU)],          # Proline
        Gln   => [qw(CAA CAG)],                  # Glutamine
        Arg   => [qw(AGA AGG CGA CGC CGG CGU)],  # Arginine
        Ser   => [qw(AGC AGU UCA UCC UCG UCU)],  # Serine
        Thr   => [qw(ACA ACC ACG ACU)],          # Threonine
        Val   => [qw(GUA GUC GUG GUU)],          # Valine
        Trp   => [qw(UGG)],                      # Tryptophan
        Tyr   => [qw(UAC UAU)],                  # Tyrosine
        '***' => [qw(UAA UAG UGA)],              # Stop codons
    );

    # Invert the grouping into a direct codon => amino acid map.
    my %amino_acid_for;
    for my $amino_acid (keys %codons_for) {
        $amino_acid_for{$_} = $amino_acid for @{ $codons_for{$amino_acid} };
    }

    # Input:  A codon: exactly three characters from [AGCU], in either case.
    # Output: The three-letter abbreviation of the encoded amino acid, "***"
    #         for a stop codon, or the empty string when the argument is not
    #         a valid codon (undefined, wrong length, or other characters).
    sub translate_codon {
        my ($codon) = @_;
        return '' unless defined $codon;

        # An exact hash lookup only accepts whole codons, so longer strings
        # that merely contain a codon are rejected.
        my $amino_acid = $amino_acid_for{ uc $codon };
        return defined $amino_acid ? $amino_acid : '';
    }
}