#!/usr/bin/perl -w
# Name: Jeremy Zucker
# email: zucker@research.dfci.harvard.edu
# Problem Set: #1

use strict;
use warnings;

# Coding-strand DNA. The embedded whitespace is layout only and is stripped
# before the sequence is used.
my $DNA_seq = "CATTACGATGCATTG
ATTTTTCAAAGGAAT
GTACTATCGAAATCA
CAAGTCGTGGACTAC
GGTTTGCAGTGGAGG
AATCGCAGTCTTTGC
AGGCTCACGCCTTTC
TTGATAAGTCGTTGT
TTCAAACGTTTAATT
TTCAGGGTGATTCAG
ATGGGGATACATATA
TGTTCCAGACGATGA
TTTCACCT";
$DNA_seq =~ s/\s+//g;
print "Cleaned up DNA sequence of length " . length($DNA_seq) . ":\n$DNA_seq\n";

my $RNA_seq = transcribe($DNA_seq);
print "\nRNA sequence: \n$RNA_seq\n";

print "\nTranslated sequence: \n";
_print_reading_frames($RNA_seq);

my $reverse_compliment = reverse_compliment($RNA_seq);
print "\nReverse-complemented RNA sequence:\n $reverse_compliment \n";

print "\nTranslation of reverse-complemented RNA sequence:\n";
_print_reading_frames($reverse_compliment);

# Input: an RNA sequence
# Output: Prints the translation and amino-acid histogram of reading frames
#         0, 1 and 2 to STDOUT
sub _print_reading_frames {
    my ($sequence) = @_;

    for my $frame (0 .. 2) {
        my %protein = translate($sequence, $frame);
        print "\n\tReading Frame $frame:\n" . $protein{"sequence"} . "\n";
        print_histogram(%protein);
    }
    return;
}

# Input: Hash table of amino acids (keys) and their frequencies (values)
# Output: Prints histogram to STDOUT
sub print_histogram {
    my %protein = @_;

    print "\nHistogram\n";
    foreach my $amino_acid (sort keys %protein) {
        # "sequence" holds the translated string, not a count, and "" is the
        # bucket for codons that could not be translated; neither is a bar.
        next if $amino_acid eq "sequence" || $amino_acid eq "";
        print "$amino_acid: " . ("*" x $protein{$amino_acid}) . "\n";
    }
    return;
}

# Input: A sequence of characters [AGCU] that represents a biologically
#        functional strand of RNA (upper or lower case)
# Output: The reverse-complemented sequence
# Note: the sub keeps its historical spelling ("compliment") so existing
#       callers continue to work.
sub reverse_compliment {
    my $sequence = shift;

    $sequence = reverse($sequence);
    # Lower-case bases are complemented too, so the result stays consistent
    # with the case-insensitive transcribe() and translate_codon().
    $sequence =~ tr/AUGCaugc/UACGuacg/;
    return $sequence;
}

# Input: a sequence of characters in [AGCU] that represents the RNA Sequence, and the reading frame (0, 1, or 2)
# Output: A hash table which contains the frequency counts of each amino acid in the sequence and the full translated sequence
#         (under the key "sequence", which is always defined)
sub translate {
    my ($sequence, $reading_frame) = @_;
    my %protein = (sequence => "");

    # Only complete codons are translated: a trailing fragment of one or two
    # bases is not a codon and is ignored rather than counted.
    my $last_start = length($sequence) - 3;
    for (my $i = $reading_frame; $i <= $last_start; $i += 3) {
        my $amino_acid = translate_codon(substr($sequence, $i, 3));
        $protein{$amino_acid}++;
        $protein{"sequence"} .= $amino_acid;
    }
    return %protein;
}

# Input: The coding strand of the DNA sequence -- NOT the template strand that the RNA polymerase II actually reads to synthesize RNA!
# Output: The transcribed RNA sequence.
sub transcribe {
    my $sequence = shift;

    # Both "T" and "t" become an upper-case "U".
    $sequence =~ tr/Tt/UU/;
    return $sequence;
}

# Input: a sequence of 3 characters that represents the tri-nucleotide codon that tRNA reads
# Output: the 3 letter symbolic representation of the corresponding amino acid.
sub translate_codon {
    # Maps one RNA codon to the three-letter abbreviation of its amino acid.
    #   Input:  a codon of exactly three characters from [AGCU], either case.
    #   Output: the amino-acid abbreviation, "***" for a stop codon, or the
    #           empty string when the argument is not a complete, recognized
    #           codon (for example a one- or two-base fragment).
    my ($codon) = @_;
    return "" unless defined $codon;

    # Every pattern is anchored so that only a whole three-base codon matches.
    return "Ala" if $codon =~ /\AGC[AGCU]\z/i;                # GCx        -> Alanine
    return "Cys" if $codon =~ /\AUG[CU]\z/i;                  # UGC, UGU   -> Cysteine
    return "Asp" if $codon =~ /\AGA[CU]\z/i;                  # GAC, GAU   -> Aspartic acid
    return "Glu" if $codon =~ /\AGA[AG]\z/i;                  # GAA, GAG   -> Glutamic acid
    return "Phe" if $codon =~ /\AUU[CU]\z/i;                  # UUC, UUU   -> Phenylalanine
    return "Gly" if $codon =~ /\AGG[AGCU]\z/i;                # GGx        -> Glycine
    return "His" if $codon =~ /\ACA[CU]\z/i;                  # CAC, CAU   -> Histidine
    return "Ile" if $codon =~ /\AAU[ACU]\z/i;                 # AUA/AUC/AUU -> Isoleucine
    return "Lys" if $codon =~ /\AAA[AG]\z/i;                  # AAA, AAG   -> Lysine
    return "Leu" if $codon =~ /\A(?:UU[AG]|CU[AGCU])\z/i;     # UUA, UUG, CUx -> Leucine
    return "Met" if $codon =~ /\AAUG\z/i;                     # AUG        -> Methionine
    return "Asn" if $codon =~ /\AAA[CU]\z/i;                  # AAC, AAU   -> Asparagine
    return "Pro" if $codon =~ /\ACC[AGCU]\z/i;                # CCx        -> Proline
    return "Gln" if $codon =~ /\ACA[AG]\z/i;                  # CAA, CAG   -> Glutamine
    return "Arg" if $codon =~ /\A(?:AG[AG]|CG[AGCU])\z/i;     # AGA, AGG, CGx -> Arginine
    return "Ser" if $codon =~ /\A(?:AG[CU]|UC[AGCU])\z/i;     # AGC, AGU, UCx -> Serine
    return "Thr" if $codon =~ /\AAC[AGCU]\z/i;                # ACx        -> Threonine
    return "Val" if $codon =~ /\AGU[AGCU]\z/i;                # GUx        -> Valine
    return "Trp" if $codon =~ /\AUGG\z/i;                     # UGG        -> Tryptophan
    return "Tyr" if $codon =~ /\AUA[CU]\z/i;                  # UAC, UAU   -> Tyrosine
    return "***" if $codon =~ /\A(?:UA[AG]|UGA)\z/i;          # UAA, UAG, UGA -> stop codon

    # Not a recognized codon: return an explicit empty string in every
    # calling context instead of falling off the end of the sub.
    return "";
}