#!/usr/bin/perl ############################################################################### # # Section7.pl # # Could English phonology explain the results of Experiments 1 and 2, as an # alternative to language-independent analytic bias? Test three hypotheses: # # 1. HH- and VV-conforming patterns are overrepresented in the lexicon # compared to what would be expected if the segments combined # independently, whereas HV- conforming patterns are underrepresented # (or less overrepresented). Hence, participants enter the experiment # primed to acquire the HH and VV patterns faster than the HV one. # # 2. HH- and VV-nonconforming items occur more frequently in English than # do HV-nonconforming ones, so that (e.g.) a GLA learner demotes the # relevant markedness constraints faster. # # 3. The HH-, HV-, and VV-conforming stimulus words are more like real words, # hence more memorable, hence have a bigger attractive effect on # participant responses. # ############################################################################### $CELEXdir = "/Users/moreton/Documents/Work/DBs/CELEX"; for ('EPW', 'EFW') { open ($_, "<$CELEXdir/$_.CD") or die "Can't open $_.CD"; } $CELEXcombinedSize = 17.9; # million $CELEXspokenSize = 1.3; # million # CELEX consonant symbols $C = 'pbtdkgNmnlrfvTDszSZjxhwJ_CFHPR'; $voicelessC = 'ptkfTsSJxh'; $voicedC = 'bdgvDzZ_mnNlrjw'; $voicelessObst = 'ptkfTsSJx'; $voicedObst = 'bdgvDzZ_'; $syllabicC = 'CFHP'; $experimentC = 'tkdg'; # Break vowels into 4 height classes, so that we can try fiddling with # how we group them into high and low. $V = 'IE{VQU\@i\#$u312456789cq0~'; @V = ( 'iuIU79', '15', 'EV\@3\$8', '\{Q\#cq0\~' ); $experimentV = 'iu\{$'; $syllbdry = q{\'\"\-}; #______________________________________________________________________________ # Get frequencies of each pronunciation (many words are pronounced alike). #______________________________________________________________________________ while ($_ = ) { ($IdNumEPW, $WordEPW, $CobEPW, $IdNumLemmaEPW, $PronCnt, $PronStatus, $PhonStrsDISC, $PhonCVBr, $PhonSylBCLX, @rest) = split /\\/; while ($_ = ) { ($IdNumEFW, $WordEFW, $IdNumLemmaEFW, $CobEFW, $CobDev, $CobMln, $CobLog, $CobW, $CobWMln, $CobWLog, $CobS, $CobSMln, $CobSLog) = split /\\/; last if ("$IdNumEFW $WordEFW $CobEFW" eq "$IdNumEPW $WordEPW $CobEPW"); if ($IdNumEFW > $IdNumEPW) { die "Couldn't find $IdNumEPW $WordEPW in EFW.CD"; } } # Rules for counting by types: (1) Ignore zero-frequency wordforms; only # count a form that actually occurs. (2) N lexical items with the same # pronunciation count N times; e.g., "see" and "sea" count as 2. $freqTab {$PhonStrsDISC} {WStype} += ($CobEFW > 0); $freqTab {$PhonStrsDISC} {Wtype} += ($CobW > 0); $freqTab {$PhonStrsDISC} {Stype} += ($CobS > 0); $freqTab {$PhonStrsDISC} {WStoken} += $CobEFW; $freqTab {$PhonStrsDISC} {Wtoken} += $CobW; $freqTab {$PhonStrsDISC} {Stoken} += $CobS; } # Now, what should we count? The associate editor is suggesting that, if a # bias exists in the English lexicon, it might facilitate acquisition of a # corresponding pattern in the experiment. I can think of two ways in which # this might happen. (A) By inducing a ranking among inactive constraints, # which makes the ranking easier to learn in the experiment. (B) By making # pattern-conforming familiarization items more similar to real words, hence # easier to remember, hence more effective in the test stage. # # To test (A), we would be looking for general, grammatical-type patterns, # rather than similarity to individual words. Thus, we would not want to # restrict ourselves to disyllables with stress patterns and segments similar # to those used in the experiments; rather, we would want to look at as much # of the lexicon as possible. To test (B), we would want to look at how # similar the average HH-, HV-, or VV-conforming familiarization item is to # other items in the lexicon -- i.e, we would want neighborhood statistics. # # (A) includes Hypotheses 1 and 2; (B) is Hypothesis 3. #______________________________________________________________________________ # Hypothesis 1: HH-, HV-, and VV-conforming patterns are overrepresented in # the lexicon compared to what would be expected if the segments combined # independently. # # Note that it would be wrong to simply compute what proportion of the # vowel-vowel sequences are HH-harmonic, because that is guaranteed to be # above 50% if the high vowels outnumber the low or vice versa and the # two classes are independent --- if you toss an unfair coin twice, you are # more than 50% likely to get the same side twice, since p^2 + (1-p)^2 # = p^2 + 1 - 2p + p^2, so deriv. is 2p - 2 + 2p, which is zero at p=1/2, # and that is a minimum, so p^2+(1-p)^2 >= 1/4+1/4= 2. #______________________________________________________________________________ # Change the following to move the line separating ``high'' from ``not high''. $highV = $V[0].$V[1]; $lowV = $V[2].$V[3]; # Regexps for detecting conforming and nonconforming patterns: $HH = qr/[$highV](?=[$C$syllbdry]*[$highV])/x; $hh = qr/[$lowV] (?=[$C$syllbdry]*[$lowV]) /x; $Hh = qr/[$highV](?=[$C$syllbdry]*[$lowV]) /x; $hH = qr/[$lowV] (?=[$C$syllbdry]*[$highV])/x; $HV = qr/[$highV][$syllbdry]*[$voicedObst] /x; $hv = qr/[$lowV] [$syllbdry]*[$voicelessObst]/x; $Hv = qr/[$highV][$syllbdry]*[$voicelessObst]/x; $hV = qr/[$lowV] [$syllbdry]*[$voicedObst] /x; $VV = qr/ (?:^|[$syllbdry]) [$voicedObst] (?=[^$syllbdry]*[$syllbdry]+[$voicedObst].* [$V]) /x; $vv = qr/ (?:^|[$syllbdry]) [$voicelessObst](?=[^$syllbdry]*[$syllbdry]+[$voicelessObst].*[$V]) /x; $Vv = qr/ (?:^|[$syllbdry]) [$voicedObst] (?=[^$syllbdry]*[$syllbdry]+[$voicelessObst].*[$V]) /x; $vV = qr/ (?:^|[$syllbdry]) [$voicelessObst](?=[^$syllbdry]*[$syllbdry]+[$voicedObst].* [$V]) /x; for (keys %freqTab) { $pron = $_; # Count pattern occurrences for $pattern ( HH, hh, Hh, hH, HV, hv, Hv, hV, VV, vv, Vv, vV ) { $re = $ {$pattern}; @matches = ($pron =~ /$re/g); $matches = 0+@matches; # print "$pron $pattern @matches\n"; # DEBUG aid for $corpus (WS, S) { for $unit (type, token) { $patternTab {$pattern} {$corpus} {$unit} += $freqTab {$pron} {"$corpus$unit"} * $matches; $wordTab {$pattern} {$corpus} {$unit} += $freqTab {$pron} {"$corpus$unit"} * ($matches > 0); } } } } # (DEBUG aid:) # for $pattern ( # HH, hh, Hh, hH, # HV, hv, Hv, hV, # VV, vv, Vv, vV # ) { # printf "$pattern\t"; # for $corpus (WS, S) { # for $unit (type, token) { # printf "%6d", $patternTab {$pattern}{$corpus} {$unit}; # # } # print "\t"; # } # print "\n"; # } print "============\nHypothesis 1\n============\n"; print "\n"; print "_"x(7+2+(7+1)*4), "\n"; printf "%-7s %-14s %-14s\n", "", "Written+Spoken", "Spoken"; printf "%-7s %-14s %-14s\n", "", "_"x14, "_"x14; printf "%-7s %-7s %-7s %-7s %-7s\n", "Pattern", "Types", "Tokens", "Types", "Tokens"; print "_"x(7+2+(7+1)*4), "\n"; for $condition (HH, HV, VV) { printf "%-7s ", "$condition"; $AB = $condition; $ab = "\L$condition"; $Ab = "\u$ab"; $aB = "\l$AB"; for $corpus (WS, S) { for $unit (type, token) { $ABct = $patternTab {$AB} {$corpus} {$unit}; $abct = $patternTab {$ab} {$corpus} {$unit}; $Abct = $patternTab {$Ab} {$corpus} {$unit}; $aBct = $patternTab {$aB} {$corpus} {$unit}; $act_conforming = ($ABct + $abct)/($ABct + $abct + $Abct + $aBct); $A = $ABct + $Abct; $a = $aBct + $abct; $B = $ABct + $aBct; $b = $Abct + $abct; $pAa = $A/($A+$a); $pBb = $B/($B+$b); $exp_conforming = $pAa*$pBb + (1-$pAa)*(1-$pBb); printf "%7.3f ", $act_conforming/$exp_conforming; } } print "\n"; } print "_"x(7+2+(7+1)*4), "\n"; print "\n\n"; #______________________________________________________________________________ # Hypothesis 2: HH- and VV-nonconforming instances occur more frequently # than HV-nonconforming ones, leading (e.g.) GLA to demote the relevant # constraints faster. #______________________________________________________________________________ print "============\nHypothesis 2\n============\n"; # By number of violating instances (words with multiple violations count # multiple times): print "\n"; print "By number of violations:\n"; print "_"x(7+2+(7+1)*4), "\n"; printf "%-7s %-14s %-14s\n", "", "Written+Spoken", "Spoken"; printf "%-7s %-14s %-14s\n", "", "_"x14, "_"x14; printf "%-7s %-7s %-7s %-7s %-7s\n", "Pattern", "Types", "Tokens", "Types", "Tokens"; print "_"x(7+2+(7+1)*4), "\n"; for $condition (HH, HV, VV) { print "$condition\t"; $AB = $condition; $ab = "\L$condition"; $Ab = "\u$ab"; $aB = "\l$AB"; for $corpus (WS, S) { for $unit (type, token) { $Abct = $patternTab {$Ab} {$corpus} {$unit}; $aBct = $patternTab {$aB} {$corpus} {$unit}; $act_nonconforming = $Abct + $aBct; printf "%7d ", $act_nonconforming; } } print "\n"; } print "_"x(7+2+(7+1)*4), "\n"; # By number of words containing at least one violation: print "\n\n"; print "By number of words containing at least one violation:\n"; print "_"x(7+2+(7+1)*4), "\n"; printf "%-7s %-14s %-14s\n", "", "Written+Spoken", "Spoken"; printf "%-7s %-14s %-14s\n", "", "_"x14, "_"x14; printf "%-7s %-7s %-7s %-7s %-7s\n", "Pattern", "Types", "Tokens", "Types", "Tokens"; print "_"x(7+2+(7+1)*4), "\n"; for $condition (HH, HV, VV) { print "$condition\t"; $AB = $condition; $ab = "\L$condition"; $Ab = "\u$ab"; $aB = "\l$AB"; for $corpus (WS, S) { for $unit (type, token) { $Abct = $wordTab {$Ab} {$corpus} {$unit}; $aBct = $wordTab {$aB} {$corpus} {$unit}; $act_nonconforming = $Abct + $aBct; printf "%7d ", $act_nonconforming; } } print "\n"; } print "_"x(7+2+(7+1)*4), "\n"; print "\n"; #______________________________________________________________________________ # Hypothesis 3: The HH-, HV-, and VV-conforming stimulus words are more # like real words (hence more memorable). #______________________________________________________________________________ # Test this using the single-edit-distance neighborhoods. # Make list of the 256 experimental stimuli @experimentC = (t, d, k, g); @experimentV = ('i', 'u', "\}", "\$"); for $C1 (@experimentC) { for $V1 (@experimentV) { for $C2 (@experimentC) { for $V2 (@experimentV) { push @expWds, "$C1$V1$C2$V2"; } } } } # Compile nbhd stats for each pattern type for (keys %freqTab) { $realWd = $_; $realWdSegs = $realWd; $realWdSegs =~ s/[$syllbdry]+//g; next if (length ($realWdSegs) < 3 or length ($realWdSegs) > 5); for (@expWds) { $HH = (/(.[iu].[iu]|.[\{\$].[\{\$])/) ? '+' : '-'; $HV = (/(.[iu][dg].|.[\{\$][tk].)/) ? '+' : '-'; $VV = (/([tk].[tk].|[dg].[dg].)/) ? '+' : '-'; # print "$_\t$realWd\t$realWdSegs\t $HH $HV $VV\n"; if (neighbors ($_, $realWdSegs)) { # print "neighbors: $_, $realWdSegs\n"; for $corpus (WS, S) { for $unit (type, token) { # $nbhd {$expWd} {$corpus} {$unit} += $freqTab {$realWd} {"$corpus$unit"}; $nbhdTab {HH}{$HH}{$corpus}{$unit} += $freqTab {$realWd} {"$corpus$unit"}; $nbhdTab {HV}{$HV}{$corpus}{$unit} += $freqTab {$realWd} {"$corpus$unit"}; $nbhdTab {VV}{$VV}{$corpus}{$unit} += $freqTab {$realWd} {"$corpus$unit"}; } } } } } print "============\nHypothesis 3\n=============\n"; print "_" x ((7+1)*9), "\n"; printf "%-7s %31s %31s\n", "", "Written+Spoken", "Spoken"; printf "%-7s %31s %31s\n", "", "_"x31, "_"x31; printf "%-7s %15s %15s %15s %15s\n", "", "Types", "Tokens", "Types", "Tokens"; printf "%-7s %15s %15s %15s %15s\n", "", "_"x15, "_"x15, "_"x15, "_"x15; printf "%-7s %7s %7s %7s %7s %7s %7s %7s %7s\n", "Pattern", "Conf", "Nonconf", "Conf", "Nonconf", "Conf", "Nonconf", "Conf", "Nonconf"; print "_" x ((7+1)*9), "\n"; for $pattern (HH, HV, VV) { printf "%-7s ", $pattern; printf "%7.2f ", $nbhdTab {$pattern}{'+'}{WS}{type}/@expWds; printf "%7.2f ", $nbhdTab {$pattern}{'-'}{WS}{type}/@expWds; printf "%7.2f ", ($nbhdTab {$pattern}{'+'}{WS}{token}/@expWds)/$CELEXcombinedSize; printf "%7.2f ", ($nbhdTab {$pattern}{'-'}{WS}{token}/@expWds)/$CELEXcombinedSize; printf "%7.2f ", $nbhdTab {$pattern}{'+'}{S}{type}/@expWds; printf "%7.2f ", $nbhdTab {$pattern}{'-'}{S}{type}/@expWds; printf "%7.2f ", ($nbhdTab {$pattern}{'+'}{S}{token}/@expWds)/$CELEXspokenSize; printf "%7.2f\n", ($nbhdTab {$pattern}{'-'}{S}{token}/@expWds)/$CELEXspokenSize; } print "_" x ((7+1)*9), "\n"; ############################################################################## # # neighbors # # Determine whether two strings are a single edit distance apart. # ############################################################################## sub neighbors { my ($a, $b) = @_; if (length ($a) > length ($b)) { ($a, $b) = ($b, $a); } if (length ($a) == 0) { return (length ($b) <= 1); } # $b could be $a plus a prefix. ($carb, $cdrb) = ($b =~ /(.)(.*)/); if ($a eq $cdrb) { return (1); } # Or, $b could be $a, with the initial segment changed ($cara, $cdra) = ($a =~ /(.)(.*)/); if ($cdra eq $cdrb) { return (1); } # Or, $b could begin just like $a, with the single-edit difference # coming later. if ($cara eq $carb) { return (neighbors ($cdra, $cdrb)); } 0; }