# VitevitchLuce.pl # This program computes the phonotactic probability metrics employed by # Jusczyk et al in their study of infants' knowledge of phonotactics # It uses the wordlist CelexWordsInTranscription.txt to calculate # probabilities from the English lexicon, and then applies these probabilities # to test items, to give predicted well-formedness scores. $count_monosyllables_only = 0; $vowels = "aeiouAEIOU&V\@3Ì¯¿®ŽYšWo•‘ŸBML"; print "\nCounting frequencies in wordlist\n"; open (WORDLIST, "CelexWordsInTranscription.txt") or die "Warning! Can't open wordlist: $!\n"; while ($line = ) { chomp($line); ($freq, $orthography, $transcription) = split("\t", $line); $original_transcription = $transcription; # The Vitevich & Luce ignore syllabification when calculating probabilities; # so, delete syllable boundaries $transcription =~ s/\[//g; $transcription =~ s/\]//g; # It will help a lot to have every phoneme represented by a single character $transcription = remove_digraphs($transcription); # Now look at which phonemes occur in which positions for (my $i = 0; $i < length($transcription); $i++) { # The current segment is the one is position i $PhonemeCount[$i]{ substr($transcription, $i, 1) } += log10($freq); # print "$i\t".substr($transcription, $i, 1)."\n"; $TotalPhones[$i] += log10($freq); $AllPhonemes{ substr($transcription, $i, 1) } = 1; } # And then count the biphones that occur in each position for (my $i = 0; $i < length($transcription) - 1; $i++) { # The current biphone is the one in position i $BiphoneCount[$i]{ substr($transcription, $i, 2) } += log10($freq); $AllBiphones{substr($transcription, $i, 2)} = 1; $TotalBiphones[$i] += log10($freq); } } close (WORDLIST); $positions = scalar @PhonemeCount; print "Last position used by any word: $positions\n"; # Now that we have counted the phonemes in each position, tally the probabilities # First, the positional probabilities print "\nCalculating positional probabilities\n"; for ($i = 0; $i < $positions; $i++) { foreach $phoneme (keys %AllPhonemes) { $PositionalProbability[$i]{ $phoneme } = $PhonemeCount[$i]{$phoneme} / $TotalPhones[$i]; } } # Let's save the results in a file open (POSPROBFILE, ">CelexPositionalProbabilities.txt") or die "Warning! Can't create positional probabilties file: $!\n"; printf POSPROBFILE "Phoneme"; for (my $i = 0; $i < $positions; $i++) { printf POSPROBFILE "\t"; printf POSPROBFILE $i+1; } printf POSPROBFILE "\n"; foreach $phoneme (sort keys %AllPhonemes) { printf POSPROBFILE replace_digraphs($phoneme); for (my $i = 0; $i < $positions; $i++) { printf POSPROBFILE "\t$PositionalProbability[$i]{$phoneme}"; } printf POSPROBFILE "\n"; } close POSPROBFILE; # Then calculate the biphone probabilities # Unlike standard n-grams, these are simply calculated a positional probabilities of 2-phoneme # sequences (just like the positional probabilities) print "\nCalculating biphone probabilities\n"; for ($i = 0; $i < $positions - 1; $i++) { foreach $biphone (keys %AllBiphones) { $PositionalProbability[$i]{ $biphone } = $BiphoneCount[$i]{ $biphone } / $TotalBiphones[$i]; } } # Let's save the results in a file open (BIPHONFILE, ">CelexBiphoneProbabilities.txt") or die "Warning! Can't create biphone probabilties file: $!\n"; printf BIPHONFILE "Biphone"; for (my $i = 0; $i < $positions - 1; $i++) { printf BIPHONFILE "\t"; printf BIPHONFILE $i+1; } printf BIPHONFILE "\n"; foreach $biphone (sort keys %AllBiphones) { printf BIPHONFILE replace_digraphs($biphone); for (my $i = 0; $i < $positions - 1; $i++) { printf BIPHONFILE "\t$PositionalProbability[$i]{$biphone}"; } printf BIPHONFILE "\n"; } close BIPHONFILE; # Finally, use these numbers to calculate probabilities for made-up words # (In this case, Jusczyk et al's experimental stimuli) # We have four files to go through, so I'll put it in a loop. print "\nDeriving predictions for test items\n"; @stimfiles = ("Exp3-Low.txt", "Exp3-High.txt", "Exp1-Low.txt", "Exp1-High.txt", "AlbrightHayes.txt"); foreach $file (@stimfiles) { open (STIMFILE, $file) or die "Warning! Can't open stimulus file: $!\n"; $file_prefix = $file; $file_prefix =~ s/.txt$//; open (PREDICTIONSFILE, ">$file_prefix-Predictions.txt") or die "Warning! Can't create file to store predictions for $file\n$!\n"; printf PREDICTIONSFILE "Stimulus\tSummed Positional Prob.\tSummed Biphone Prob.\n"; $number_of_items = 0; $SummedPositionalProb = 0; $SummedBiphoneProb = 0; $CumulativePositionalProb = 0; $CumulativeBiphoneProb = 0; while ($line = ) { chomp($line); $word = remove_digraphs($line); $number_of_items++; # Calculate the positional probabilities $SummedPositionalProb = 0; $SummedBiphoneProb = 0; for (my $i = 0; $i < length($word); $i++) { $SummedPositionalProb += $PositionalProbability[$i]{ substr($word, $i, 1) }; } for (my $i = 0; $i < length($word) - 1; $i++) { $SummedBiphoneProb += $PositionalProbability[$i]{ substr($word, $i, 2) }; } $CumulativePositionalProb += $SummedPositionalProb; $CumulativeBiphoneProb += $SummedBiphoneProb; # Write the results to a file printf PREDICTIONSFILE "$line\t$SummedPositionalProb\t$SummedBiphoneProb\n"; } printf PREDICTIONSFILE "\nAVERAGE:\t" . $CumulativePositionalProb / $number_of_items . "\t" . $CumulativeBiphoneProb / $number_of_items . "\n"; close (PREDICTIONSFILE); } sub log10 { my $n = @_[0]; return log($n) / log(10); } sub remove_digraphs { $string = @_[0]; # Some digraphs indicate length redundantly on tense vowels; removing # the colon won't result in any neutralizations $string =~ s/i:/i/g; $string =~ s/A:/A/g; $string =~ s/u:/u/g; $string =~ s/3:/3/g; $string =~ s/A~:/Ì/g; $string =~ s/O~:/¯/g; # Some tense vowels have lax correspondents with the same symbol; # have to change $string =~ s/O:/¿/g; # Nasalized short and long ¾ doesn't even seem like a real distinction; # I'm going to neutralize them $string =~ s/&~(:)/®/g; $string =~ s/eI/Ž/g; $string =~ s/aI/Y/g; $string =~ s/OI/š/g; $string =~ s/aU/W/g; $string =~ s/\@U/o/g; # The following usually correspond to r in American English $string =~ s/I\@/•/g; $string =~ s/E\@/‘/g; $string =~ s/U\@/Ÿ/g; # Also some consonant digraphs $string =~ s/dZ/J/g; $string =~ s/tS/C/g; $string =~ s/n,/B/g; # totally arbitrary; N is taken $string =~ s/m,/M/g; $string =~ s/l,/L/g; $string =~ s/r\*/R/g; return $string; } sub replace_digraphs { $string = @_[0]; # Some digraphs indicate length redundantly on tense vowels; removing # the colon won't result in any neutralizations $string =~ s/i/i:/g; $string =~ s/A/A:/g; $string =~ s/u/u:/g; $string =~ s/3/3:/g; $string =~ s/Ì/A~:/g; $string =~ s/¯/O~:/g; # Some tense vowels have lax correspondents with the same symbol; # have to change $string =~ s/¿/O:/g; # Nasalized short and long ¾ doesn't even seem like a real distinction; # I'm going to neutralize them $string =~ s/®/&~/g; $string =~ s/Ž/eI/g; $string =~ s/Y/aI/g; $string =~ s/š/OI/g; $string =~ s/W/aU/g; $string =~ s/o/\@U/g; $string =~ s/•/I\@/g; $string =~ s/‘/E\@/g; $string =~ s/Ÿ/U\@/g; # Also some consonant digraphs $string =~ s/J/dZ/g; $string =~ s/C/tS/g; $string =~ s/B/n,/g; # totally arbitrary; N is taken $string =~ s/M/m,/g; $string =~ s/L/l,/g; $string =~ s/R/r\*/g; return $string; }