In [1]:
import pickle
import re
import pandas as pd
import numpy as np

# for the stats
import pymc as pm
import bambi as bmb
import pytensor.tensor as pt
from scipy.special import expit, logit
import statsmodels.api as sm

# for plotting
import seaborn as sns
import matplotlib.pyplot as plt
import arviz as az

This is the notebook with the regression analyses for the paper on understanding and measuring disagreements in bird taxonomy. The notebook starts by cleaning and combining data. Those data are then used for the analysis. If you want to run the analysis yourself, you can download the datafile from zenodo, and use that instead of the data-cleaning cells at the start of the notebook (10.5281/zenodo.10078307, file TD_EffortStudy_data.csv).

The traces that were reported in the paper were also uploaded in the same dataset on Zenodo. If you don't want to run the models yourself, you can download them on zenodo in the .netcdf file format with the az.from_netcdf('...').

Note: the DAG used to build all the models in this notebook is described in the manuscript submitted to EJT.

In [16]:
# versions

print('\n'.join(f'{m.__name__}=={m.__version__}' for m in globals().values() if getattr(m, '__version__', None)))
re==2.2.1
pandas==1.5.3
numpy==1.24.2
pymc==5.1.2
bambi==0.10.0
statsmodels.api==0.13.5
seaborn==0.12.2
arviz==0.15.1
In [2]:
SEED = 201288
In [128]:
## plotting functions

# function for getting the probabilities

def pordlog(a):
    
    # transform back to cumulative probabilites
    pa = expit(a)
    p_cum = np.concatenate(([0.], pa, [1.]))
    
    # get the intervals instead of the cumulative probs
    return p_cum[1:] - p_cum[:-1]

# function for plotting the posterior predictive proportions of ordinal scores

def ordinal_plot(nscores, df, title, labels,axis):
    colors = ['r','b','grey','g','c']
    
    for i in range(nscores):


        sns.scatterplot(x=new_hits,
                        y=df.iloc[i],
                        alpha = 0.5,
                        color = colors[i],
                        label = labels[i],
                        ax=ax[axis])
    ax[axis].legend()
    ax[axis].set_title(title)
    
    
# function for creating a stackplot of the proportions

def plot_probabilities2(ncuts, trace,cuts,name_lin,title,labels,axis,ylim,xrange):
    logit = cuts - trace.predictions[name_lin]
    logit_mean = logit.mean(dim = ['chain','draw'])
    probabilities = np.array([pordlog(logit_mean[:,i].values) for i in range(len(xrange))])
    ax[axis].stackplot(xrange, probabilities.T)
    ax[axis].set_title(title, fontsize = 17)
    ax[axis].legend(labels,loc = 'lower right')
    ax[axis].set_ylim(ymin = ylim, ymax = 1)
    ax[axis].set_ylabel('Proportion of total concepts', fontsize = 14)
    
    

Data¶

In [6]:
# disagreement data

conflicts = ['dc_recoded','scdr']
c_dct = {'class_bin':'Taxonomic conflict','dc_bin':'Concept conflict','scdr_bin':'Rank conflict', 'classif_recoded':'Taxonomic conflict','dc_recoded':'Concept conflict', 'scdr':'Rank conflict'}

df = pd.read_csv('...', sep = ";")
df.drop(['Unnamed: 0'], axis=1, inplace=True)

# rename conflict to more meaningful names

df.rename(columns = {'dc_recoded':'concept_ord','scdr':'rank_ord'}, inplace=True)
conflicts = ['concept_ord','rank_ord']

# create df of only BL species

dfl = df.loc[df.in_bl == 1].copy()

print(df.shape)
print(dfl.shape)

dfl.head()
(12730, 108)
(10999, 108)
Out[6]:
avibase_id birdlife_id tsn eol_page_id iucn_name in_bl genus_bl name_bl family_bl sub_bl ... genus_conflict ep_conflict nom_conflict nom_agr nom_all_dif nom_bin ep_bin gen_bin concept_ord classif_recoded
0 95A724BEBE54EED7 Euplectes diadematus 560040.0 45515027.0 22719168.0 1.0 Euplectes diadematus NaN 0.0 ... 0 0 0 6 0 0 0 0 0 0
1 D4162FE44AF93BB6 Centropus bernsteini 554754.0 45517807.0 22684215.0 1.0 Centropus bernsteini Cuculidae 0.0 ... 0 0 0 6 0 0 0 0 0 0
2 96819591FD94B905 Phylloscopus fuscatus 179846.0 45510049.0 22715264.0 1.0 Phylloscopus fuscatus Phylloscopidae 0.0 ... 0 0 0 3 0 0 0 0 1 1
1734 CFFF429637FFF836 Oreotrochilus stolzmanni 693766.0 1268587.0 NaN 1.0 Oreotrochilus stolzmanni Trochilidae 0.0 ... 0 0 0 3 0 0 0 0 0 1
1735 1C2F4E80D146FA22 Oxyura maccoa 175180.0 1048502.0 22679820.0 1.0 Oxyura maccoa NaN 0.0 ... 0 0 0 6 0 0 0 0 0 0

5 rows × 108 columns

In [8]:
# load the google and wos data

effort_df = pd.read_csv('...')
effort_df.drop(['Unnamed: 0'], axis=1, inplace=True)

effort_df.head()
Out[8]:
avibase_id tot_conflict tax_conflict nom_conflict hits1 hits2 hits3 hits4 hits5 wos_hits genus_bl family_bl_all order_bl_all
0 95A724BEBE54EED7 0.0 0.0 0.0 2710 2470 2940 2460 2570 1.0 Euplectes Ploceidae passeriformes
1 D4162FE44AF93BB6 0.0 0.0 0.0 2470 2780 2490 3040 3530 0.0 Centropus Cuculidae cuculiformes
2 96819591FD94B905 3.0 3.0 0.0 40400 39000 39000 43600 39900 38.0 Phylloscopus Phylloscopidae passeriformes
3 CFFF429637FFF836 3.0 3.0 0.0 2340 2180 2100 2230 1990 1.0 Oreotrochilus Trochilidae caprimulgiformes
4 1C2F4E80D146FA22 0.0 0.0 0.0 12300 12300 12700 12400 12400 3.0 Oxyura Anatidae anseriformes
In [10]:
# load the gbif data

with open('...', "rb") as input_file:
    gbif = pickle.load(input_file)
    
gbif_df = pd.DataFrame(gbif, index = ['gbif']).T
gbif_df.reset_index(inplace=True)
gbif_df.columns = ['birdlife_id','gbif']
gbif_df['birdlife_id'] = [i.capitalize() for i in gbif_df.birdlife_id.values]


gbif_df.to_csv('...')

NOTE: The file created in the cell below is the datafile that has been uploaded to zenodo (TD_EffortStudy_data.csv). If you want to repeat the analysis, just use that file.

In [15]:
## make dataframe with all relevant columns


# select cols

dfcols = ['avibase_id','birdlife_id'] + conflicts 
effortcols = ['avibase_id','hits1','hits2', 'hits3', 'hits4', 'hits5', 'wos_hits']

# merge with google and wos

dfx = dfl[dfcols].merge(effort_df[effortcols], on = 'avibase_id',how = 'left')

# merge with gbif occurences

dfx = dfx.merge(gbif_df, on = 'birdlife_id',how = 'left')

## create columns needed for analysis

# gbif occurences: model zeros separately, and take log10 of the nonzero values

dfx['gbif_zero'] = np.where(dfx.gbif == 0, 1, 0)
dfx['gbif_lognoneg'] = [np.log10(i) if i > 0 else i for i in dfx.gbif.values]

# # idem for wos

dfx['wos_zero'] = np.where(dfx.wos_hits == 0, 1, 0)
dfx['wos_lognoneg'] = [np.log10(i) if i > 0 else i for i in dfx.wos_hits.values]

# # take mean of the google hits, and then log10

dfx['google_mean'] = dfx[['hits1','hits2', 'hits3', 'hits4', 'hits5']].mean(axis = 1)
dfx['loggoogle'] = np.log10(dfx.google_mean)


dfx.to_csv('...')

dfx.head()
Out[15]:
avibase_id birdlife_id concept_ord rank_ord hits1 hits2 hits3 hits4 hits5 wos_hits gbif gbif_zero gbif_lognoneg wos_zero wos_lognoneg google_mean loggoogle
0 95A724BEBE54EED7 Euplectes diadematus 0 0 2710 2470 2940 2460 2570 1.0 333.0 0 2.522444 0 0.000000 2630.0 3.419956
1 D4162FE44AF93BB6 Centropus bernsteini 0 0 2470 2780 2490 3040 3530 0.0 746.0 0 2.872739 1 0.000000 2862.0 3.456670
2 96819591FD94B905 Phylloscopus fuscatus 1 0 40400 39000 39000 43600 39900 38.0 87984.0 0 4.944404 0 1.579784 40380.0 4.606166
3 CFFF429637FFF836 Oreotrochilus stolzmanni 0 3 2340 2180 2100 2230 1990 1.0 636.0 0 2.803457 0 0.000000 2168.0 3.336059
4 1C2F4E80D146FA22 Oxyura maccoa 0 0 12300 12300 12700 12400 12400 3.0 15453.0 0 4.189013 0 0.477121 12420.0 4.094122
In [16]:
# check and remove nans

withnans = len(dfx)

dfx = dfx.dropna()

print(f'dropped {withnans - len(dfx)} concepts with missing data')
dropped 4 concepts with missing data
In [17]:
# correlations

dfx[['loggoogle','wos_lognoneg','gbif_lognoneg']].corr().style.background_gradient(cmap='coolwarm')
Out[17]:
  loggoogle wos_lognoneg gbif_lognoneg
loggoogle 1.000000 0.756405 0.726334
wos_lognoneg 0.756405 1.000000 0.570808
gbif_lognoneg 0.726334 0.570808 1.000000

simulate data¶

All seems to work fine when we simulate this

In [521]:
n = 1000
prob0 = 0.1
x = np.random.normal(0,1,int(n*1-prob0))
x0 = np.zeros(int(n*prob0))
xdata = np.concatenate([x,x0])
zerodata = np.concatenate([np.zeros(int(n*1-prob0)), np.ones(int(n*prob0))])
true_counts = -0.8
true_zeros = 1.5

c1 = 2
c2 = 3
c3 = 3.3

logodds1 = c1 - (true_counts*xdata + true_zeros*zerodata)
logodds2 = c2 - (true_counts*xdata + true_zeros*zerodata)
logodds3 = c3 - (true_counts*xdata + true_zeros*zerodata)

prob1 = expit(logodds1)
prob2 = expit(logodds2)
prob3 = expit(logodds3)

p1 = prob1
p2 = prob2 - prob1
p3 = prob3 - prob2
p4 = 1 - prob3

obsprobs = np.array([p1,p2,p3,p4]).T
y = [np.random.choice(4, 1, p = list(i))[0] for i in obsprobs]
In [522]:
df = pd.DataFrame([xdata,zerodata,y, obsprobs[:,0],obsprobs[:,1],obsprobs[:,2],obsprobs[:,3]], index = ['x','zeros','y','p1','p2','p3','p4']).T
df.head()
Out[522]:
x zeros y p1 p2 p3 p4
0 1.482865 0.0 0.0 0.960315 0.024710 0.003838 0.011137
1 -0.378768 0.0 0.0 0.845141 0.091708 0.015589 0.047562
2 -0.978175 0.0 3.0 0.771616 0.130190 0.023550 0.074643
3 0.390296 0.0 0.0 0.909885 0.054961 0.008872 0.026282
4 0.309677 0.0 0.0 0.904456 0.058136 0.009424 0.027984
In [524]:
with pm.Model() as simul_ordinal:

    # data

    data = pm.MutableData('data', xdata)
    zerodatas = pm.MutableData('zerodatas', zerodata)

    cutpoints = pm.Normal('cutpoints',
                           mu=[0,1,2],
                           sigma=5,
                           transform=pm.distributions.transforms.univariate_ordered,
                           shape = 3,
                           )
    
    counts = pm.Normal('counts',0,5)
    zeros = pm.Normal('zeros',0,5)
    
    p = pm.Deterministic('p',counts * data + zeros * zerodatas) 

    y_out = pm.OrderedLogistic("y_out", p, cutpoints, observed=y,compute_p = True)

    trace = pm.sample()
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [cutpoints, counts, zeros]
100.00% [8000/8000 01:12<00:00 Sampling 4 chains, 0 divergences]
Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 98 seconds.
In [525]:
az.summary(trace, var_names = ['zeros','counts','cutpoints'])
Out[525]:
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_bulk ess_tail r_hat
zeros 1.463 0.222 1.067 1.901 0.004 0.003 2941.0 3078.0 1.0
counts -0.761 0.103 -0.954 -0.568 0.002 0.001 3291.0 3127.0 1.0
cutpoints[0] 1.963 0.102 1.768 2.152 0.002 0.001 2647.0 2822.0 1.0
cutpoints[1] 2.991 0.138 2.740 3.264 0.003 0.002 3034.0 3040.0 1.0
cutpoints[2] 3.270 0.151 2.982 3.549 0.003 0.002 3118.0 2974.0 1.0

Web of Science effort¶

The technique of separately modeling the zeros is discussed in (Hosmer & Lemeshow's[https://www.amazon.com/dp/0471356328]) book on logistic regression.

In [48]:
plt.hist(dfx.wos_lognoneg, bins = 30)
plt.show()
In [49]:
names = ['rank','concept']
coords = {'conflicts':names}
In [52]:
with pm.Model(coords=coords) as wos_m_ordinal:
    wos_data = pm.MutableData('wos_data', dfx.wos_lognoneg.values)
    wos_zeros = pm.MutableData('wos_zeros', dfx.wos_zero.values)
    
    counts = pm.Normal('counts',0,1, dims = 'conflicts')
    
    zeros = pm.Normal('zeros',0,1,dims = 'conflicts')
    
     # prior for the cutpoints
    
    
    cutpoints_rank = pm.Normal('cutpoints_rank',
                           mu=[0,1,2,3],
                           sigma=1,
                           transform=pm.distributions.transforms.univariate_ordered,
                           shape = 4,
                           )
    
    cutpoints_concept = pm.Normal('cutpoints_concept',
                           mu=[0,1],
                           sigma=1,
                           transform=pm.distributions.transforms.univariate_ordered,
                           shape = 2,
                           )
    
    p_rank = pm.Deterministic('p_rank',counts[0] * wos_data + zeros[0] * wos_zeros)
    p_concept = pm.Deterministic('p_concept',counts[1] * wos_data + zeros[1] * wos_zeros)
    
    y_rank = pm.OrderedLogistic("y_rank", p_rank, cutpoints_rank, observed=dfx.rank_ord,compute_p = False)
    y_concept = pm.OrderedLogistic("y_concept", p_concept, cutpoints_concept, observed=dfx.concept_ord,compute_p = False)

    pr = pm.sample_prior_predictive()
    
Sampling: [counts, cutpoints_concept, cutpoints_rank, y_concept, y_rank, zeros]
In [53]:
with wos_m_ordinal:
    trace_wos_ordinal = pm.sample(5000, return_inferencedata = True)
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [counts, zeros, cutpoints_rank, cutpoints_concept]
100.00% [24000/24000 14:27<00:00 Sampling 4 chains, 0 divergences]
Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 900 seconds.
In [54]:
az.summary(trace_wos_ordinal, var_names = ['counts','zeros','cutpoints_concept','cutpoints_rank'])
Out[54]:
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_bulk ess_tail r_hat
counts[rank] -0.375 0.068 -0.503 -0.248 0.001 0.000 12736.0 15484.0 1.0
counts[concept] 0.321 0.046 0.235 0.409 0.000 0.000 13972.0 15152.0 1.0
zeros[rank] 0.632 0.080 0.486 0.786 0.001 0.001 12287.0 14311.0 1.0
zeros[concept] 0.016 0.087 -0.152 0.175 0.001 0.001 15476.0 15422.0 1.0
cutpoints_concept[0] 2.397 0.060 2.284 2.509 0.001 0.000 13510.0 15056.0 1.0
cutpoints_concept[1] 4.086 0.084 3.930 4.246 0.001 0.000 18292.0 15989.0 1.0
cutpoints_rank[0] 2.181 0.066 2.057 2.306 0.001 0.000 11180.0 13530.0 1.0
cutpoints_rank[1] 2.193 0.066 2.068 2.317 0.001 0.000 11211.0 13826.0 1.0
cutpoints_rank[2] 2.229 0.067 2.103 2.354 0.001 0.000 11229.0 13724.0 1.0
cutpoints_rank[3] 4.383 0.103 4.192 4.579 0.001 0.001 16063.0 14072.0 1.0
In [132]:
az.to_netcdf(trace_wos_ordinal, '...')
In [55]:
fig, ax = plt.subplots(figsize = (10,4))

az.plot_forest(trace_wos_ordinal , var_names = ['counts','zeros'],combined = True, ax=ax)
Out[55]:
array([<Axes: title={'center': '94.0% HDI'}>], dtype=object)
In [88]:
# posterior predictive checks: outcomes

## create new data that increases gradually from 0 to 4 (because log10 scale), and that has both 0s and 1s for the zeros
thinned_trace = trace_wos_ordinal.sel(draw=slice(None, None, 5))
n = thinned_trace.posterior['p_rank'].shape[-1]

new_hits = np.repeat(np.linspace(0,4,128),int(n/128) + 1)[:n]
new_zeros = np.concatenate([np.ones(int(n/128), dtype = 'int32'), np.zeros(n -int(n/128), dtype = 'int32')])

# now sample from the model with this hypothetical data

with wos_m_ordinal:
        pm.set_data({"wos_data":new_hits, "wos_zeros" :new_zeros}) 
        ppc_rank = pm.sample_posterior_predictive(thinned_trace, progressbar = True, var_names = ['y_rank'], predictions = True)
        
with wos_m_ordinal:
        pm.set_data({"wos_data":new_hits, "wos_zeros" :new_zeros}) 
        ppc_concept = pm.sample_posterior_predictive(thinned_trace, progressbar = True, var_names = ['y_concept'], predictions = True)
Sampling: [y_rank]
100.00% [4000/4000 02:42<00:00]
Sampling: [y_concept]
100.00% [4000/4000 02:52<00:00]
In [89]:
# posterior predictive checks: outcomes

new_hits2_w = np.linspace(0,4,100)
new_zeros2_w = np.concatenate([[1], np.zeros(99, dtype = 'int32')])



with wos_m_ordinal:
        pm.set_data({"wos_data":new_hits2_w, "wos_zeros" :new_zeros2_w}) 
        thinned_trace = trace_wos_ordinal.sel(draw=slice(None, None, 10))
        ppc_rank2_wos = pm.sample_posterior_predictive(thinned_trace, progressbar = True, var_names = ['p_rank'], predictions = True)
        
with wos_m_ordinal:
        pm.set_data({"wos_data":new_hits2_w, "wos_zeros" :new_zeros2_w}) 
        thinned_trace = trace_wos_ordinal.sel(draw=slice(None, None, 10))
        ppc_concept2_wos = pm.sample_posterior_predictive(trace_wos_ordinal, progressbar = True, var_names = ['p_concept'], predictions = True)
Sampling: []
100.00% [2000/2000 00:00<00:00]
Sampling: []
100.00% [20000/20000 00:00<00:00]
In [90]:
fig, ax = plt.subplots(ncols = 2,figsize = (10,4))

cuts_concept_wos = trace_wos_ordinal.posterior['cutpoints_concept']
cuts_rank_wos = trace_wos_ordinal.posterior['cutpoints_rank']


plot_probabilities2(len(dfx.concept_ord.unique()),
                    ppc_concept2_wos,
                    cuts_concept_wos,'p_concept','Concept conflict',
                    ['No conflict','3 vs. 1','2 vs 2 (or 2 vs 1 vs 1)'],
                    0,0.7,new_hits2_w)
cuts_rank = trace_wos_ordinal.posterior['cutpoints_rank']

plot_probabilities2(len(dfx.rank_ord.unique()),
                    ppc_rank2_wos,
                    cuts_rank_wos,'p_rank','Rank conflict',
                    ['No conflict','1 lists disagrees','2 lists disagree','3 lists disagree','all disagree'],
                    1,0.7,new_hits2_w)
In [91]:
# put the predictive samples in a dataframe with a row for each ordinal score and each draw as a column 
# (with each cell being the count of that ordinal score for that draw)
# then divide by the total samples for that draw to get the proportion of that score, 

concept_outcome_samples = pd.DataFrame(az.extract(ppc_concept.predictions['y_concept'])['y_concept'].values).T.apply(pd.Series.value_counts) / 800
rank_outcome_samples = pd.DataFrame(az.extract(ppc_rank.predictions['y_rank'])['y_rank'].values).T.apply(pd.Series.value_counts) / 800
In [92]:
# now plot these proportions by increasing wos hits


fig, ax = plt.subplots(ncols = 2, figsize = (10,4),sharey = True)


ordinal_plot(len(dfx.concept_ord.unique()), concept_outcome_samples, 'Concept conflict', 
            ['No concept conflict','Conflict score 1', 'Conflict score 2'],0)

ordinal_plot(len(dfx.rank_ord.unique()), rank_outcome_samples, 'Rank conflict', 
            ['No Rank conflict','Conflict score 1', 'Conflict score 2','Conflict score 3', 'Conflict score 4'],1)

plt.show()

GBIF occurences¶

In [95]:
plt.hist(dfx.gbif_lognoneg, bins = 30)

plt.show()
In [97]:
with pm.Model(coords=coords) as gbif_m_ordinal:
    gbif_data = pm.MutableData('gbif_data', dfx.gbif_lognoneg.values)
    gbif_zeros = pm.MutableData('gbif_zeros', dfx.gbif_zero.values)
    
    counts = pm.Normal('counts',0,1, dims = 'conflicts')
    
    zeros = pm.Normal('zeros',0,1,dims = 'conflicts')
    
     # prior for the cutpoints
    
    
    cutpoints_rank = pm.Normal('cutpoints_rank',
                           mu=[0,1,2,3],
                           sigma=1,
                           transform=pm.distributions.transforms.univariate_ordered,
                           shape = 4,
                           )
    
    cutpoints_concept = pm.Normal('cutpoints_concept',
                           mu=[0,1],
                           sigma=1,
                           transform=pm.distributions.transforms.univariate_ordered,
                           shape = 2,
                           )
    
    p_rank = pm.Deterministic('p_rank',counts[0] * gbif_data + zeros[0] * gbif_zeros)
    p_concept = pm.Deterministic('p_concept',counts[1] * gbif_data + zeros[1] * gbif_zeros)
    
    y_rank = pm.OrderedLogistic("y_rank", p_rank, cutpoints_rank, observed=dfx.rank_ord,compute_p = False)
    y_concept = pm.OrderedLogistic("y_concept", p_concept, cutpoints_concept, observed=dfx.concept_ord,compute_p = False)

    pr = pm.sample_prior_predictive()
    
Sampling: [counts, cutpoints_concept, cutpoints_rank, y_concept, y_rank, zeros]
In [133]:
with gbif_m_ordinal:
    trace_gbif_ordinal = pm.sample(5000, return_inferencedata = True)

az.to_netcdf(trace_gbif_ordinal, '...')
In [99]:
az.summary(trace_gbif_ordinal, var_names = ['counts','zeros','cutpoints_rank','cutpoints_concept'])
Out[99]:
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_bulk ess_tail r_hat
counts[rank] -0.688 0.028 -0.742 -0.637 0.000 0.000 17152.0 14309.0 1.0
counts[concept] 0.078 0.025 0.032 0.125 0.000 0.000 15108.0 12877.0 1.0
zeros[rank] 0.232 0.143 -0.038 0.495 0.001 0.001 20608.0 16666.0 1.0
zeros[concept] 0.137 0.232 -0.306 0.571 0.002 0.001 22212.0 15791.0 1.0
cutpoints_rank[0] 0.094 0.085 -0.072 0.245 0.001 0.001 16693.0 13783.0 1.0
cutpoints_rank[1] 0.107 0.085 -0.056 0.261 0.001 0.001 16777.0 13825.0 1.0
cutpoints_rank[2] 0.147 0.085 -0.017 0.301 0.001 0.001 16917.0 14070.0 1.0
cutpoints_rank[3] 2.441 0.112 2.236 2.654 0.001 0.001 19981.0 15381.0 1.0
cutpoints_concept[0] 2.445 0.096 2.264 2.623 0.001 0.001 15189.0 13053.0 1.0
cutpoints_concept[1] 4.128 0.112 3.917 4.338 0.001 0.001 17088.0 13524.0 1.0
In [100]:
fig, ax = plt.subplots(figsize = (10,4))

az.plot_forest(trace_gbif_ordinal , var_names = ['counts','zeros'],combined = True, ax=ax)
Out[100]:
array([<Axes: title={'center': '94.0% HDI'}>], dtype=object)
In [101]:
# posterior predictive checks: outcomes

## create new data that increases gradually from 0 to 4 (because log10 scale), and that has both 0s and 1s for the zeros
thinned_trace = trace_gbif_ordinal.sel(draw=slice(None, None, 5))
n = thinned_trace.posterior['p_rank'].shape[-1]

new_hits = np.repeat(np.linspace(0,4,128),int(n/128) + 1)[:n]
new_zeros = np.concatenate([np.ones(int(n/128), dtype = 'int32'), np.zeros(n -int(n/128), dtype = 'int32')])

with gbif_m_ordinal:
        pm.set_data({"gbif_data":new_hits, "gbif_zeros" :new_zeros}) 
        ppc_rank = pm.sample_posterior_predictive(trace_gbif_ordinal, progressbar = True, var_names = ['y_rank'], predictions = True)
        
with gbif_m_ordinal:
        pm.set_data({"gbif_data":new_hits, "gbif_zeros" :new_zeros}) 
        ppc_concept = pm.sample_posterior_predictive(trace_gbif_ordinal, progressbar = True, var_names = ['y_concept'], predictions = True)
Sampling: [y_rank]
100.00% [20000/20000 15:46<00:00]
Sampling: [y_concept]
100.00% [20000/20000 15:37<00:00]
In [102]:
# posterior predictive checks: outcomes

new_hits2_gb = np.linspace(0,4,100)
new_zeros2_gb = np.concatenate([[1], np.zeros(99, dtype = 'int32')])

with gbif_m_ordinal:
        pm.set_data({"gbif_data":new_hits2_gb, "gbif_zeros" :new_zeros2_gb}) 
        thinned_trace = trace_gbif_ordinal.sel(draw=slice(None, None, 10))
        ppc_rank2_gbif = pm.sample_posterior_predictive(thinned_trace, progressbar = True, var_names = ['p_rank'], predictions = True)
        
with gbif_m_ordinal:
        pm.set_data({"gbif_data":new_hits2_gb, "gbif_zeros" :new_zeros2_gb}) 
        thinned_trace = trace_gbif_ordinal.sel(draw=slice(None, None, 10))
        ppc_concept2_gbif = pm.sample_posterior_predictive(thinned_trace, progressbar = True, var_names = ['p_concept'], predictions = True)
Sampling: []
100.00% [2000/2000 00:00<00:00]
Sampling: []
100.00% [2000/2000 00:00<00:00]
In [103]:
# put the predictive samples in a dataframe with a row for each ordinal score and each draw as a column 
# (with each cell being the count of that ordinal score for that draw)
# then divide by the total samples for that draw to get the proportion of that score, 

concept_outcome_samples = pd.DataFrame(az.extract(ppc_concept.predictions['y_concept'])['y_concept'].values).T.apply(pd.Series.value_counts) / 400
rank_outcome_samples = pd.DataFrame(az.extract(ppc_rank.predictions['y_rank'])['y_rank'].values).T.apply(pd.Series.value_counts) / 400
In [104]:
# now plot these proportions by increasing wos hits


fig, ax = plt.subplots(ncols = 2, figsize = (10,4),sharey = True)


ordinal_plot(len(dfx.concept_ord.unique()), concept_outcome_samples, 'Concept conflict', 
            ['No concept conflict','Conflict score 1', 'Conflict score 2'],0)

ordinal_plot(len(dfx.rank_ord.unique()), rank_outcome_samples, 'Rank conflict', 
            ['No Rank conflict','Conflict score 1', 'Conflict score 2','Conflict score 3', 'Conflict score 4'],1)

plt.show()
In [105]:
fig, ax = plt.subplots(ncols = 2,figsize = (10,4))

cuts_concept_gbif = trace_gbif_ordinal.posterior['cutpoints_concept']
cuts_rank_gbif = trace_gbif_ordinal.posterior['cutpoints_rank']


plot_probabilities2(len(dfx.concept_ord.unique()),
                    ppc_concept2_gbif,
                    cuts_concept_gbif,'p_concept','Concept conflict',
                    ['No conflict','1 lists disagrees','2 lists disagree'],
                    0,0.5,new_hits2_gb)

plot_probabilities2(len(dfx.rank_ord.unique()),
                    ppc_rank2_gbif,
                    cuts_rank_gbif,'p_rank','Rank conflict',
                    ['No conflict','1 lists disagrees','2 lists disagree','3 lists disagree','all disagree'],
                    1,0.3,new_hits2_gb)

Google hits¶

In [106]:
plt.hist(np.log10(dfx.google_mean), bins = 30)
plt.show()
In [107]:
with pm.Model(coords=coords) as google_ordinal:

    # data

    google_data = pm.MutableData('google_data', dfx.loggoogle.values)
    

    
    # prior for the cutpoints
    
    
    cutpoints_rank = pm.Normal('cutpoints_rank',
                           mu=[0,1,2,3],
                           sigma=1,
                           transform=pm.distributions.transforms.univariate_ordered,
                           shape = 4,
                           )
    
    cutpoints_concept = pm.Normal('cutpoints_concept',
                           mu=[0,1],
                           sigma=1,
                           transform=pm.distributions.transforms.univariate_ordered,
                           shape = 2,
                           )

    
    googlecounts = pm.Normal('googlecounts',0,1, dims = 'conflicts')
    
    
    p_rank = pm.Deterministic('p_rank',googlecounts[0] * google_data )
    p_concept = pm.Deterministic('p_concept',googlecounts[1] * google_data) 
    
    # don't compute and store in the trace the inferred probabilities of each categories, based on the cutpoints’ values
    # it takes too much memory; set compute_p to False

    y_rank = pm.OrderedLogistic("y_rank", p_rank, cutpoints_rank, observed=dfx.rank_ord,compute_p = False)
    y_concept = pm.OrderedLogistic("y_concept", p_concept, cutpoints_concept, observed=dfx.concept_ord,compute_p = False)
In [134]:
with google_ordinal:
    trace_google_ordinal = pm.sample(5000)

az.to_netcdf(trace_google_ordinal, '...')
In [109]:
az.summary(trace_google_ordinal, var_names = ['googlecounts','cutpoints_rank','cutpoints_concept'])
Out[109]:
mean sd hdi_3% hdi_97% mcse_mean mcse_sd ess_bulk ess_tail r_hat
googlecounts[rank] -1.767 0.053 -1.869 -1.671 0.000 0.000 14747.0 13723.0 1.0
googlecounts[concept] 0.168 0.048 0.079 0.259 0.000 0.000 17223.0 12776.0 1.0
cutpoints_rank[0] -4.232 0.184 -4.588 -3.901 0.002 0.001 14711.0 13560.0 1.0
cutpoints_rank[1] -4.217 0.184 -4.560 -3.873 0.002 0.001 14698.0 13533.0 1.0
cutpoints_rank[2] -4.175 0.184 -4.529 -3.845 0.002 0.001 14718.0 13399.0 1.0
cutpoints_rank[3] -1.747 0.186 -2.110 -1.414 0.001 0.001 16830.0 14594.0 1.0
cutpoints_concept[0] 2.818 0.191 2.456 3.173 0.001 0.001 17198.0 12879.0 1.0
cutpoints_concept[1] 4.501 0.199 4.122 4.870 0.001 0.001 17859.0 13765.0 1.0
In [110]:
fig, ax = plt.subplots(figsize = (6,3))

az.plot_forest(trace_google_ordinal, var_names = 'googlecounts',combined = True,ax=ax)
Out[110]:
array([<Axes: title={'center': '94.0% HDI'}>], dtype=object)
In [111]:
# posterior predictive checks: outcomes

thinned_trace = trace_gbif_ordinal.sel(draw=slice(None, None, 5))
n = thinned_trace.posterior['p_rank'].shape[-1]

new_hits = np.repeat(np.linspace(0,4,128),int(n/128) + 1)[:n]

with google_ordinal:
        pm.set_data({"google_data":new_hits}) 
        ppc_rank = pm.sample_posterior_predictive(thinned_trace, progressbar = True, var_names = ['y_rank'], predictions = True)
        
with google_ordinal:
        pm.set_data({"google_data":new_hits})
        thinned_trace = trace_google_ordinal.sel(draw=slice(None, None, 10))
        ppc_concept = pm.sample_posterior_predictive(thinned_trace, progressbar = True, var_names = ['y_concept'], predictions = True)
Sampling: [googlecounts, y_rank]
100.00% [4000/4000 02:45<00:00]
Sampling: [y_concept]
100.00% [2000/2000 01:18<00:00]
In [112]:
# posterior predictive checks: outcomes

new_hits2_g = np.linspace(2,6,100)

with google_ordinal:
        pm.set_data({"google_data":new_hits2_g}) 
        thinned_trace = trace_google_ordinal.sel(draw=slice(None, None, 5))
        ppc_rank2_g = pm.sample_posterior_predictive(trace_google_ordinal, progressbar = True, var_names = ['p_rank'], predictions = True)
        
with google_ordinal:
        pm.set_data({"google_data":new_hits2_g}) 
        thinned_trace = trace_google_ordinal.sel(draw=slice(None, None, 5))
        ppc_concept2_g = pm.sample_posterior_predictive(trace_google_ordinal, progressbar = True, var_names = ['p_concept'], predictions = True)
Sampling: []
100.00% [20000/20000 00:00<00:00]
Sampling: []
100.00% [20000/20000 00:00<00:00]
In [113]:
# put the predictive samples in a dataframe with a row for each ordinal score and each draw as a column 
# (with each cell being the count of that ordinal score for that draw)
# then divide by the total samples for that draw to get the proportion of that score, 

concept_outcome_samples = pd.DataFrame(az.extract(ppc_concept.predictions['y_concept'])['y_concept'].values).T.apply(pd.Series.value_counts) / 400
rank_outcome_samples = pd.DataFrame(az.extract(ppc_rank.predictions['y_rank'])['y_rank'].values).T.apply(pd.Series.value_counts) / 400
In [114]:
# now plot these proportions by increasing wos hits


fig, ax = plt.subplots(ncols = 2, figsize = (10,4),sharey = True)


ordinal_plot(len(dfx.concept_ord.unique()), concept_outcome_samples, 'Concept conflict', 
            ['No concept conflict','Conflict score 1', 'Conflict score 2'],0)

ordinal_plot(len(dfx.rank_ord.unique()), rank_outcome_samples, 'Rank conflict', 
            ['No Rank conflict','Conflict score 1', 'Conflict score 2','Conflict score 3', 'Conflict score 4'],1)

plt.show()
In [115]:
fig, ax = plt.subplots(ncols = 2,figsize = (10,4))

cuts_concept_g = trace_google_ordinal.posterior['cutpoints_concept']
cuts_rank_g = trace_google_ordinal.posterior['cutpoints_rank']

plot_probabilities2(len(dfx.concept_ord.unique()),
                    ppc_concept2_g,
                    cuts_concept_g,'p_concept','Concept conflict',
                    ['No conflict','1 vs 3','2 vs 2'],
                    0,0.5,new_hits2_g)

plot_probabilities2(len(dfx.rank_ord.unique()),
                    ppc_rank2_g,
                    cuts_rank_g,'p_rank','Rank conflict',
                    ['No conflict','1 rank case','2 rank cases','3 rank cases','4 rank cases'],
                    1,0.2,new_hits2_g)

Combined figure¶

In [131]:
# rank

fig, ax = plt.subplots(ncols = 3,figsize = (14,4), sharey = True,sharex = False)


plot_probabilities2(len(dfx.rank_ord.unique()),
                    ppc_rank2_wos,
                    cuts_rank_wos,'p_rank','WoS hits',
                    [],
                    0,0.2,new_hits2_w)



plot_probabilities2(len(dfx.rank_ord.unique()),
                    ppc_rank2_gbif,
                    cuts_rank_gbif,'p_rank','GBIF occurences',
                    ['No conflict','1 rank case','2 rank cases','3 rank cases','4 rank cases'],
                    1,0.2,new_hits2_gb)


plot_probabilities2(len(dfx.rank_ord.unique()),
                    ppc_rank2_g,
                    cuts_rank_g,'p_rank','Google hits',
                    [],
                    2,0.2,new_hits2_g)

ax[0].set_xlabel('Log10 WoS hits', fontsize = 14)
ax[1].set_xlabel('Log10 GBIF occurences', fontsize = 14)
ax[2].set_xlabel('Log10 Google hits', fontsize = 14)
fig.tight_layout()

ax[1].set_ylabel(' ')
ax[2].set_ylabel(' ')
# plt.suptitle('Rank conflict by measure of effort', fontsize = 15, y=1.05)
fig.savefig('...', format='tif', dpi=600)

plt.show()
In [130]:
# concept
fig, ax = plt.subplots(ncols = 3,figsize = (14,4), sharey = True)

plot_probabilities2(len(dfx.concept_ord.unique()),
                    ppc_concept2_wos,
                    cuts_concept_wos,'p_concept','WoS hits',
                    [],
                    0,0.6,new_hits2_w)

plot_probabilities2(len(dfx.concept_ord.unique()),
                    ppc_concept2_gbif,
                    cuts_concept_gbif,'p_concept','GBIF occurences',
                    ['No conflict','3 vs. 1','2 vs 2 (or 2 vs 1 vs 1)'],
                    1,0.6,new_hits2_gb)

plot_probabilities2(len(dfx.concept_ord.unique()),
                    ppc_concept2_g,
                    cuts_concept_g,'p_concept','Google hits',
                    [],
                    2,0.6,new_hits2_g)
ax[0].set_xlabel('Log10 WoS hits', fontsize = 14)
ax[1].set_xlabel('Log10 GBIF occurences', fontsize = 14)
ax[2].set_xlabel('Log10 Google hits', fontsize = 14)
fig.tight_layout()

ax[1].set_ylabel(' ')
ax[2].set_ylabel(' ')

# plt.suptitle('Concept conflict by measure of effort', fontsize = 15, y=1.05)
fig.savefig('...', format='tif', dpi=600)

plt.show()
In [ ]: