Content - 21f0d8eb8ea501cadf460312b03e99edc8151613 - c802c4d/CGN_Fig2.py

CGN_Fig2.py
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed May 20 11:26:42 2020

@author: sanoev

This script extracts and plot the basic tmeporal variation in the syllables and words of the CGN related to Figure 2 of the main manuscript.
"""
base = ''

#%% locs
ID = 'S05_00'
SEQUENCE_LEN = 10
MIN_WORD_FREQUENCY = 10
STEP = 1

#from sklearn.linear_model import LinearRegression
import pickle
import numpy as np
import pandas as pd
import scipy
import scipy.stats
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.formula.api import ols
import ColorScheme as cs
cmaps = cs.CCcolormap()
bfi = cs.baseFigInfo()

#%% info
loadloc = './Data/'
    
filename = loadloc + '/WordLevel_TrainAndTestData'          
with open(filename, 'rb') as f:
    TrainAndTest = pickle.load(f)

# save this, so you can also run stuff just ont he validation set:
filename = loadloc + '/TrainAndTestSplits'            
with open(filename, 'rb') as f:  # Python 3: open(..., 'wb')
    TT = pickle.load(f)  
TT_sentences = TT[0]
TT_next_words = TT[1]
TT_index = TT[4][:TT[5]]
TT_sentences_test = TT[2]
TT_next_words_test= TT[3]
TT_index_test = TT[4][TT[5]:]
    
sentences = TrainAndTest[0]
next_words = TrainAndTest[1]
words = TrainAndTest[4]

word_indices = dict((c, i) for i, c in enumerate(words))
indices_word = dict((i, c) for i, c in enumerate(words))

#% get panda info (with ISI etc)
filename = loadloc + '/WordLevel_TrainAndTestData_PDinfo_light'            
with open(filename, 'rb') as f:  # Python 3: open(..., 'wb')
    PandaInfo = pickle.load(f)
    
#%% go through all and extract the duration 
WordInfo = pd.DataFrame({'word':str(),
                       'ID': str(),
                       'N': int(),
                       'mean': float(),
                       'std': float(),
                       'median': float(),
                       'IQR': float()}, index = [0])

for wrcnt, word in enumerate(words):
    Durs = []
    for filecnt, subPan in enumerate(PandaInfo[2]):
         Durs = Durs + list(subPan['End'][subPan['Word']==word] - subPan['Begin'][subPan['Word']==word])
    Dseries = [word, words[word][1], len(Durs), np.mean(Durs), np.std(Durs), np.median(Durs), scipy.stats.iqr(Durs)]
    WordInfo.loc[wrcnt] = Dseries

#% save the final wordlist panda info
#filename = loadloc + '/CorpusInfo'            
#with open(filename, 'wb') as f: 
#    pickle.dump([WordInfo],f)      
    
#%% go to celex to get some additional information
filename = loadloc + '/CorpusInfo'            
with open(filename, 'rb') as f: 
    WordInfo = pickle.load(f)[0]

# dump all words as text file so celex can extract information about syllable features
#fileO = open(loadloc +'/worddump.txt', 'w+')
#for word in words:
#    fileO.write(word + '\n')
#fileO.close()

# read the Celex Data
filename = loadloc +'/celexwords.txt'
Cel = pd.read_csv(filename, delimiter = '\\', engine = 'python')

CelinWI = pd.DataFrame({'Celexword':str(),
                           'WordSyl': str(),
                           'Nsyl': int(),
                           'Nsylphon': int(),
                           'PhonSylCLX': str(),
                           'PhonCLX': str(),
                           'Freq': float(),
                           'FreqLog': float(),
                           'length': int()}, index = [0])
# go through the wordInfo and add celex info (if excisting):
for it in range(len(WordInfo)):
    BI = Cel['Word'] == WordInfo['word'][it]
    if BI.any():
        CelinWI.loc[it] = Cel[BI].values.tolist()[0] + [len(WordInfo['word'][it])]
    else:
        CelinWI.loc[it] = ['', '', 0,0,'','', 0.0, 0.0 ,len(WordInfo['word'][it])]
WordInfo_c = pd.concat([WordInfo, CelinWI], axis=1, sort = False)

#% save the final wordlist panda info
#filename = loadloc + '/CorpusInfoAndCelex'            
#with open(filename, 'wb') as f:  
#    pickle.dump([WordInfo_c],f)   

###################################     
#%% plot some descriptives:
filename = loadloc + '/CorpusInfoAndCelex'            
with open(filename, 'rb') as f:  
    WordInfo_c = pickle.load(f)[0] 

#%% plot some examples of words from the corpus
subwords = ['van', 'zijn', 'snel','stem','hebben', 'eten', 'volgen', 'dragen']
freqs = np.zeros(len(subwords))
AD = list()
for wrcnt, word in enumerate(subwords):
    BI = WordInfo_c['word'] == word
    freqs[wrcnt] = WordInfo_c[BI].Freq.values[0]
    Durs = []
    for filecnt, subPan in enumerate(PandaInfo[2]):
         Durs = Durs + list(subPan['End'][subPan['Word']==word] - subPan['Begin'][subPan['Word']==word])
    AD.append(Durs)

#%% Figure 2A
axs = list()
fig = plt.figure(constrained_layout=True, figsize = (bfi.figsize.Col2,1.25)) # iin inches
gs = fig.add_gridspec(1, 8)
for wrcnt, word in enumerate(subwords):    
    axs.append(fig.add_subplot(gs[0,wrcnt]))   
    sns.distplot(AD[wrcnt],ax=axs[-1]).set_title('{} ({:.1f})'.format(word, freqs[wrcnt]))  
    axs[-1].set_xlim([0,1]); axs[-1].set_xticks([0,0.5,1])
    axs[-1].set_ylim([0, 10])
    plt.text(0.95, 0.9, '{:.3f}'.format(np.mean(AD[wrcnt]))[1:], transform=axs[-1].transAxes, \
             horizontalalignment='right',verticalalignment='top')
    if wrcnt == len(subwords)-1:
        axs[-1].tick_params(axis='x', which='both', bottom=True, top=False, labelbottom=True)
        axs[-1].set_xlabel('duration (sec)')
    else:
        axs[-1].tick_params(axis='x', which='both', bottom=True, top=False, labelbottom=False)
    axs[-1].tick_params(axis='y', which='both', right=False, left=False, labelleft=False)
plt.show()
#fig.savefig(figloc + ID + '_ExampleTimeDistWords.pdf', format='pdf')
#fig.savefig(figloc + ID + '_ExampleTimeDistWords.jpg', format='jpg')

#%% plot overall mean + std (independent of word length)
axs = list()
fig = plt.figure(constrained_layout=True, figsize = (bfi.figsize.Col1,2))
gs = fig.add_gridspec(1, 2)
axs.append(fig.add_subplot(gs[0,0]))
BI = np.isnan(WordInfo_c['mean']) ==False
sns.distplot(WordInfo_c[BI]['mean'], ax=axs[-1]).set_title('Mean ({:.3f})'.format(np.mean(WordInfo_c[BI]['mean'])))
axs.append(fig.add_subplot(gs[0,1]))
sns.distplot(WordInfo_c[BI]['std'], ax=axs[-1]).set_title('Std ({:.3f})'.format(np.mean(WordInfo_c[BI]['std'])))
 
#fig.savefig(figloc + ID + '_MeanStd.pdf', format='pdf')
#fig.savefig(figloc + ID + '_MeanStd.jpg', format='jpg')   

#%% word length:
unwl = [2,3,4,5,6,7,8,9,10,11,12,13,14,15] # word lengths to plot
axs = list()
fig = plt.figure(constrained_layout=True, figsize = (bfi.figsize.Col2,7))
Ncol = 4
gs = fig.add_gridspec(5, Ncol)
pl_row = 0
pl_col = 0
Omeandur= np.zeros(len(unwl))
Ostddur=np.zeros(len(unwl))
for it,wl in enumerate(unwl):
    axs.append(fig.add_subplot(gs[pl_row,pl_col]))
    pl_col = pl_col+1
    if pl_col == Ncol:
        pl_col = 0
        pl_row = pl_row+1
        
    BI = WordInfo_c['length']==wl
    meanDur = WordInfo_c[BI]['mean'].values.tolist()
    meanDur = [x for x in meanDur if str(x)!='nan']
    Omeandur[it] = np.mean(meanDur)
    meanStd = WordInfo_c[BI]['std'].values.tolist()
    meanStd = [x for x in meanStd if str(x)!='nan']    
    Ostddur[it] = np.mean(meanStd)
    sns.distplot(meanDur,ax=axs[-1]).set_title('{}'.format(str(wl)))
    plt.text(0.95, 0.9, '{:.3f}'.format(Omeandur[it])[1:], transform=axs[-1].transAxes, \
             horizontalalignment='right',verticalalignment='top')
    axs[-1].set_xlim([0,1.5])
    if it == len(unwl)-1:
        axs[-1].set_xlabel('duration (sec)')
plt.show() 
#fig.savefig(figloc + ID + '_WordLength.pdf', format='pdf')
#fig.savefig(figloc + ID + '_WordLength.jpg', format='jpg')

#%% amount of syllables:
unsl = [1,2,3,4,5,6] # word lengths to plot
axs = list()
fig = plt.figure(constrained_layout=True, figsize = (bfi.figsize.Col2,7))
gs = fig.add_gridspec(4, Ncol)
pl_row = 0
pl_col = 0
OmeandurS = np.zeros(len(unsl))
OstddurS = np.zeros(len(unsl))
for it,sl in enumerate(unsl):
    axs.append(fig.add_subplot(gs[pl_row,pl_col]))
    pl_col = pl_col+1
    if pl_col == Ncol:
        pl_col = 0
        pl_row = pl_row+1
        
    BI = WordInfo_c['Nsyl']==sl
    meanDurS = WordInfo_c[BI]['mean'].values.tolist()
    meanDurS = [x for x in meanDurS if str(x)!='nan']
    OmeandurS[it] = np.mean(meanDurS)
    meanStdS = WordInfo_c[BI]['std'].values.tolist()
    meanStdS = [x for x in meanStdS if str(x)!='nan']    
    OstddurS[it] = np.mean(meanStd)
    sns.distplot(meanDurS,ax=axs[-1]).set_title('{}'.format(str(sl)))
    plt.text(0.95, 0.9, '{:.3f}'.format(OmeandurS[it])[1:], transform=axs[-1].transAxes, \
             horizontalalignment='right',verticalalignment='top')
    axs[-1].set_xlim([0,1.5])
    if it == len(unsl)-1:
        axs[-1].set_xlabel('duration (sec)')
plt.show() 
#fig.savefig(figloc + ID + '_SyllLength.pdf', format='pdf')   
#fig.savefig(figloc + ID + '_SyllLength.jpg', format='jpg')     

#%% Figure 2D
axs = list()
fig = plt.figure(constrained_layout=True, figsize = (bfi.figsize.Col2,2)) 
gs = fig.add_gridspec(1, 4)
listval = [unwl, unsl]
names = ['letters','syllables']
colnames = ['length','Nsyl']
AmeanDur = [Omeandur]+[OmeandurS]
ls_sm = list()
cntpl = 0

def doplot(cntpl, length, meanDur, val, Adur, rh, p, tit):
    axs.append(fig.add_subplot(gs[0,cntpl]))
    sns.violinplot(x=length,y=meanDur, ax=axs[-1], inner = None, hue = length>0); axs[-1].scatter(val,Adur,c=cs.cols[1],s=10)
    if it == 0:
        axs[-1].set_xticks(np.arange(1,15,2));
        axs[-1].set_xticklabels(np.arange(2,15,2))
    axs[-1].set_xlabel('N ' + names[it]); axs[-1].set_ylabel(tit); 
    axs[-1].set_title('\u03C1={:.3f}, p={:.3f}'.format(rh,p))
    axs[-1].get_legend().set_visible(False)

for it in range(2):
    # word length
    BI = np.isin(np.array(WordInfo_c[colnames[it]]), listval[it])
    meanDur = np.array(WordInfo_c[BI]['mean'].values.tolist())
    meanStd = np.array(WordInfo_c[BI]['std'].values.tolist())
    itNN = [it for it,x in enumerate(meanDur) if str(x)!='nan']
    meanDur = meanDur[itNN]
    meanStd = meanStd[itNN]
    length = np.array(WordInfo_c[BI][colnames[it]].values.tolist())
    length = length[itNN]    
    rhstd,p = scipy.stats.pearsonr(length,meanStd)
    rh,p = scipy.stats.pearsonr(length,meanDur)
    doplot(cntpl, length, meanDur, np.array(listval[it])-listval[it][0], AmeanDur[it], rh, p, 'duration')
    
    axs.append(fig.add_subplot(gs[0,cntpl+1]))
    rate = 1/(meanDur/length)
    rh,p = scipy.stats.pearsonr(length,rate)
    doplot(cntpl+1, length, rate, np.array(listval[it])-listval[it][0], 1/(AmeanDur[it]/np.array(listval[it])), rh, p, 'rate')
    
    subdata = WordInfo_c[BI]
    ls_sm.append(ols('mean ~ ' + colnames[it], data = subdata).fit())
    cntpl = cntpl + 2
plt.show()
#fig.savefig(figloc + ID + '_RateEff.pdf', format='pdf')  
#fig.savefig(figloc + ID + '_RateEff.jpg', format='jpg')  

ls_sm[0].summary()
ls_sm[1].summary()

#%% same, but for word frequency
BI = np.array(WordInfo_c['Freq']) > 0
meanDurWF = np.array(WordInfo_c[BI]['mean'].values.tolist())
meanStdWF = np.array(WordInfo_c[BI]['std'].values.tolist())
itNN = [it for it,x in enumerate(meanDurWF) if str(x)!='nan']
meanDurWF = meanDurWF[itNN]
meanStdWF = meanStdWF[itNN]
lengthWF = np.array(WordInfo_c[BI]['Freq'].values.tolist())
lengthWF = lengthWF[itNN]
rh,p = scipy.stats.pearsonr(lengthWF,meanDurWF)
rhstd,p = scipy.stats.pearsonr(lengthWF,meanStdWF)
 
#%% Figure 2B
g = sns.jointplot(x= lengthWF,y = meanDurWF, kind = 'hex', size = bfi.figsize.Col2/4)
sns.regplot(lengthWF,meanDurWF, ax=g.ax_joint,scatter=False)
g.ax_joint.set_xlabel('word frequency ($x10^{6}$)');g.ax_joint.set_ylabel('duration');g.ax_marg_x.set_title('mean')
#g.savefig(figloc + ID + '_WordFreqMean.pdf', format='pdf')  
#g.savefig(figloc + ID + '_WordFreqMean.jpg', format='jpg')  

g = sns.jointplot(x= lengthWF,y = meanStdWF, kind = 'hex', size = bfi.figsize.Col2/4)
sns.regplot(lengthWF,meanStdWF, ax=g.ax_joint,scatter=False)
g.ax_joint.set_xlabel('word frequency ($x10^{6}$)');g.ax_joint.set_ylabel('duration');g.ax_marg_x.set_title('standard deviation')
#g.savefig(figloc + ID + '_WordFreqStd.pdf', format='pdf')  
#g.savefig(figloc + ID + '_WordFreqStd.jpg', format='jpg')  

#%% Figure 2C
SL = 1
sylam = ['mono-syllabic words','bi-syllabic words']
BI = np.logical_and(np.array(WordInfo_c['Nsyl']) == SL, np.array(WordInfo_c['Freq']) > 0)
meanDurWFS = np.array(WordInfo_c[BI]['mean'].values.tolist())
meanStdWFS = np.array(WordInfo_c[BI]['std'].values.tolist())
itNN = [it for it,x in enumerate(meanDurWFS) if str(x)!='nan']
meanDurWFS = meanDurWFS[itNN]
meanStdWFS = meanStdWFS[itNN]
lengthWFS = np.array(WordInfo_c[BI]['Freq'].values.tolist())
lengthWFS = lengthWFS[itNN]

g = sns.jointplot(x= lengthWFS,y = meanDurWFS, kind = 'hex', size = bfi.figsize.Col2/4)
sns.regplot(lengthWFS,meanDurWFS, ax=g.ax_joint,scatter=False)
g.ax_joint.set_xlabel('word frequency ($x10^{6}$)');g.ax_joint.set_ylabel('duration');g.ax_marg_x.set_title(sylam[SL-1])
#g.savefig(figloc + ID + '_WordFreqMean_SL' + str(SL) + '.pdf', format='pdf')  
#g.savefig(figloc + ID + '_WordFreqMean_SL' + str(SL) +'.jpg', format='jpg')  
g = sns.jointplot(x= lengthWFS,y = meanStdWFS, kind = 'hex', size = bfi.figsize.Col2/4)
sns.regplot(lengthWFS,meanStdWFS, ax=g.ax_joint,scatter=False)
g.ax_joint.set_xlabel('word frequency ($x10^{6}$)');g.ax_joint.set_ylabel('duration');g.ax_marg_x.set_title(sylam[SL-1])
#g.savefig(figloc + ID + '_WordFreqStd_SL' + str(SL) + '.pdf', format='pdf')  
#g.savefig(figloc + ID + '_WordFreqStd_SL' + str(SL) + '.jpg', format='jpg')