import pickle
import re
import pandas as pd
import numpy as np

# for the stats
import pymc as pm
import bambi as bmb
import pytensor.tensor as pt
from scipy.special import expit, logit
import statsmodels.api as sm

# for plotting
import seaborn as sns
import matplotlib.pyplot as plt
import arviz as az


# versions

print('\n'.join(f'{m.__name__}=={m.__version__}' for m in globals().values() if getattr(m, '__version__', None)))

re==2.2.1
pandas==1.5.3
numpy==1.24.2
pymc==5.1.2
bambi==0.10.0
statsmodels.api==0.13.5
seaborn==0.12.2
arviz==0.15.1


SEED = 201288


## plotting functions

# function for getting the probabilities

def pordlog(a):
    
    # transform back to cumulative probabilites
    pa = expit(a)
    p_cum = np.concatenate(([0.], pa, [1.]))
    
    # get the intervals instead of the cumulative probs
    return p_cum[1:] - p_cum[:-1]

# function for plotting the posterior predictive proportions of ordinal scores

def ordinal_plot(nscores, df, title, labels,axis):
    colors = ['r','b','grey','g','c']
    
    for i in range(nscores):


        sns.scatterplot(x=new_hits,
                        y=df.iloc[i],
                        alpha = 0.5,
                        color = colors[i],
                        label = labels[i],
                        ax=ax[axis])
    ax[axis].legend()
    ax[axis].set_title(title)
    
    
# function for creating a stackplot of the proportions

def plot_probabilities2(ncuts, trace,cuts,name_lin,title,labels,axis,ylim,xrange):
    logit = cuts - trace.predictions[name_lin]
    logit_mean = logit.mean(dim = ['chain','draw'])
    probabilities = np.array([pordlog(logit_mean[:,i].values) for i in range(len(xrange))])
    ax[axis].stackplot(xrange, probabilities.T)
    ax[axis].set_title(title, fontsize = 17)
    ax[axis].legend(labels,loc = 'lower right')
    ax[axis].set_ylim(ymin = ylim, ymax = 1)
    ax[axis].set_ylabel('Proportion of total concepts', fontsize = 14)


# disagreement data

conflicts = ['dc_recoded','scdr']
c_dct = {'class_bin':'Taxonomic conflict','dc_bin':'Concept conflict','scdr_bin':'Rank conflict', 'classif_recoded':'Taxonomic conflict','dc_recoded':'Concept conflict', 'scdr':'Rank conflict'}

df = pd.read_csv('...', sep = ";")
df.drop(['Unnamed: 0'], axis=1, inplace=True)

# rename conflict to more meaningful names

df.rename(columns = {'dc_recoded':'concept_ord','scdr':'rank_ord'}, inplace=True)
conflicts = ['concept_ord','rank_ord']

# create df of only BL species

dfl = df.loc[df.in_bl == 1].copy()

print(df.shape)
print(dfl.shape)

dfl.head()

(12730, 108)
(10999, 108)


# load the google and wos data

effort_df = pd.read_csv('...')
effort_df.drop(['Unnamed: 0'], axis=1, inplace=True)

effort_df.head()


# load the gbif data

with open('...', "rb") as input_file:
    gbif = pickle.load(input_file)
    
gbif_df = pd.DataFrame(gbif, index = ['gbif']).T
gbif_df.reset_index(inplace=True)
gbif_df.columns = ['birdlife_id','gbif']
gbif_df['birdlife_id'] = [i.capitalize() for i in gbif_df.birdlife_id.values]


gbif_df.to_csv('...')


## make dataframe with all relevant columns


# select cols

dfcols = ['avibase_id','birdlife_id'] + conflicts 
effortcols = ['avibase_id','hits1','hits2', 'hits3', 'hits4', 'hits5', 'wos_hits']

# merge with google and wos

dfx = dfl[dfcols].merge(effort_df[effortcols], on = 'avibase_id',how = 'left')

# merge with gbif occurences

dfx = dfx.merge(gbif_df, on = 'birdlife_id',how = 'left')

## create columns needed for analysis

# gbif occurences: model zeros separately, and take log10 of the nonzero values

dfx['gbif_zero'] = np.where(dfx.gbif == 0, 1, 0)
dfx['gbif_lognoneg'] = [np.log10(i) if i > 0 else i for i in dfx.gbif.values]

# # idem for wos

dfx['wos_zero'] = np.where(dfx.wos_hits == 0, 1, 0)
dfx['wos_lognoneg'] = [np.log10(i) if i > 0 else i for i in dfx.wos_hits.values]

# # take mean of the google hits, and then log10

dfx['google_mean'] = dfx[['hits1','hits2', 'hits3', 'hits4', 'hits5']].mean(axis = 1)
dfx['loggoogle'] = np.log10(dfx.google_mean)


dfx.to_csv('...')

dfx.head()


# check and remove nans

withnans = len(dfx)

dfx = dfx.dropna()

print(f'dropped {withnans - len(dfx)} concepts with missing data')

dropped 4 concepts with missing data


# correlations

dfx[['loggoogle','wos_lognoneg','gbif_lognoneg']].corr().style.background_gradient(cmap='coolwarm')


n = 1000
prob0 = 0.1
x = np.random.normal(0,1,int(n*1-prob0))
x0 = np.zeros(int(n*prob0))
xdata = np.concatenate([x,x0])
zerodata = np.concatenate([np.zeros(int(n*1-prob0)), np.ones(int(n*prob0))])
true_counts = -0.8
true_zeros = 1.5

c1 = 2
c2 = 3
c3 = 3.3

logodds1 = c1 - (true_counts*xdata + true_zeros*zerodata)
logodds2 = c2 - (true_counts*xdata + true_zeros*zerodata)
logodds3 = c3 - (true_counts*xdata + true_zeros*zerodata)

prob1 = expit(logodds1)
prob2 = expit(logodds2)
prob3 = expit(logodds3)

p1 = prob1
p2 = prob2 - prob1
p3 = prob3 - prob2
p4 = 1 - prob3

obsprobs = np.array([p1,p2,p3,p4]).T
y = [np.random.choice(4, 1, p = list(i))[0] for i in obsprobs]


df = pd.DataFrame([xdata,zerodata,y, obsprobs[:,0],obsprobs[:,1],obsprobs[:,2],obsprobs[:,3]], index = ['x','zeros','y','p1','p2','p3','p4']).T
df.head()


with pm.Model() as simul_ordinal:

    # data

    data = pm.MutableData('data', xdata)
    zerodatas = pm.MutableData('zerodatas', zerodata)

    cutpoints = pm.Normal('cutpoints',
                           mu=[0,1,2],
                           sigma=5,
                           transform=pm.distributions.transforms.univariate_ordered,
                           shape = 3,
                           )
    
    counts = pm.Normal('counts',0,5)
    zeros = pm.Normal('zeros',0,5)
    
    p = pm.Deterministic('p',counts * data + zeros * zerodatas) 

    y_out = pm.OrderedLogistic("y_out", p, cutpoints, observed=y,compute_p = True)

    trace = pm.sample()

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [cutpoints, counts, zeros]

Sampling 4 chains for 1_000 tune and 1_000 draw iterations (4_000 + 4_000 draws total) took 98 seconds.


az.summary(trace, var_names = ['zeros','counts','cutpoints'])


plt.hist(dfx.wos_lognoneg, bins = 30)
plt.show()


names = ['rank','concept']
coords = {'conflicts':names}


with pm.Model(coords=coords) as wos_m_ordinal:
    wos_data = pm.MutableData('wos_data', dfx.wos_lognoneg.values)
    wos_zeros = pm.MutableData('wos_zeros', dfx.wos_zero.values)
    
    counts = pm.Normal('counts',0,1, dims = 'conflicts')
    
    zeros = pm.Normal('zeros',0,1,dims = 'conflicts')
    
     # prior for the cutpoints
    
    
    cutpoints_rank = pm.Normal('cutpoints_rank',
                           mu=[0,1,2,3],
                           sigma=1,
                           transform=pm.distributions.transforms.univariate_ordered,
                           shape = 4,
                           )
    
    cutpoints_concept = pm.Normal('cutpoints_concept',
                           mu=[0,1],
                           sigma=1,
                           transform=pm.distributions.transforms.univariate_ordered,
                           shape = 2,
                           )
    
    p_rank = pm.Deterministic('p_rank',counts[0] * wos_data + zeros[0] * wos_zeros)
    p_concept = pm.Deterministic('p_concept',counts[1] * wos_data + zeros[1] * wos_zeros)
    
    y_rank = pm.OrderedLogistic("y_rank", p_rank, cutpoints_rank, observed=dfx.rank_ord,compute_p = False)
    y_concept = pm.OrderedLogistic("y_concept", p_concept, cutpoints_concept, observed=dfx.concept_ord,compute_p = False)

    pr = pm.sample_prior_predictive()

Sampling: [counts, cutpoints_concept, cutpoints_rank, y_concept, y_rank, zeros]


with wos_m_ordinal:
    trace_wos_ordinal = pm.sample(5000, return_inferencedata = True)

Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (4 chains in 4 jobs)
NUTS: [counts, zeros, cutpoints_rank, cutpoints_concept]

Sampling 4 chains for 1_000 tune and 5_000 draw iterations (4_000 + 20_000 draws total) took 900 seconds.


az.summary(trace_wos_ordinal, var_names = ['counts','zeros','cutpoints_concept','cutpoints_rank'])


az.to_netcdf(trace_wos_ordinal, '...')


fig, ax = plt.subplots(figsize = (10,4))

az.plot_forest(trace_wos_ordinal , var_names = ['counts','zeros'],combined = True, ax=ax)

array([<Axes: title={'center': '94.0% HDI'}>], dtype=object)


# posterior predictive checks: outcomes

## create new data that increases gradually from 0 to 4 (because log10 scale), and that has both 0s and 1s for the zeros
thinned_trace = trace_wos_ordinal.sel(draw=slice(None, None, 5))
n = thinned_trace.posterior['p_rank'].shape[-1]

new_hits = np.repeat(np.linspace(0,4,128),int(n/128) + 1)[:n]
new_zeros = np.concatenate([np.ones(int(n/128), dtype = 'int32'), np.zeros(n -int(n/128), dtype = 'int32')])

# now sample from the model with this hypothetical data

with wos_m_ordinal:
        pm.set_data({"wos_data":new_hits, "wos_zeros" :new_zeros}) 
        ppc_rank = pm.sample_posterior_predictive(thinned_trace, progressbar = True, var_names = ['y_rank'], predictions = True)
        
with wos_m_ordinal:
        pm.set_data({"wos_data":new_hits, "wos_zeros" :new_zeros}) 
        ppc_concept = pm.sample_posterior_predictive(thinned_trace, progressbar = True, var_names = ['y_concept'], predictions = True)

Sampling: [y_rank]

Sampling: [y_concept]


# posterior predictive checks: outcomes

new_hits2_w = np.linspace(0,4,100)
new_zeros2_w = np.concatenate([[1], np.zeros(99, dtype = 'int32')])



with wos_m_ordinal:
        pm.set_data({"wos_data":new_hits2_w, "wos_zeros" :new_zeros2_w}) 
        thinned_trace = trace_wos_ordinal.sel(draw=slice(None, None, 10))
        ppc_rank2_wos = pm.sample_posterior_predictive(thinned_trace, progressbar = True, var_names = ['p_rank'], predictions = True)
        
with wos_m_ordinal:
        pm.set_data({"wos_data":new_hits2_w, "wos_zeros" :new_zeros2_w}) 
        thinned_trace = trace_wos_ordinal.sel(draw=slice(None, None, 10))
        ppc_concept2_wos = pm.sample_posterior_predictive(trace_wos_ordinal, progressbar = True, var_names = ['p_concept'], predictions = True)

Sampling: []

Sampling: []


fig, ax = plt.subplots(ncols = 2,figsize = (10,4))

cuts_concept_wos = trace_wos_ordinal.posterior['cutpoints_concept']
cuts_rank_wos = trace_wos_ordinal.posterior['cutpoints_rank']


plot_probabilities2(len(dfx.concept_ord.unique()),
                    ppc_concept2_wos,
                    cuts_concept_wos,'p_concept','Concept conflict',
                    ['No conflict','3 vs. 1','2 vs 2 (or 2 vs 1 vs 1)'],
                    0,0.7,new_hits2_w)
cuts_rank = trace_wos_ordinal.posterior['cutpoints_rank']

plot_probabilities2(len(dfx.rank_ord.unique()),
                    ppc_rank2_wos,
                    cuts_rank_wos,'p_rank','Rank conflict',
                    ['No conflict','1 lists disagrees','2 lists disagree','3 lists disagree','all disagree'],
                    1,0.7,new_hits2_w)


# put the predictive samples in a dataframe with a row for each ordinal score and each draw as a column 
# (with each cell being the count of that ordinal score for that draw)
# then divide by the total samples for that draw to get the proportion of that score, 

concept_outcome_samples = pd.DataFrame(az.extract(ppc_concept.predictions['y_concept'])['y_concept'].values).T.apply(pd.Series.value_counts) / 800
rank_outcome_samples = pd.DataFrame(az.extract(ppc_rank.predictions['y_rank'])['y_rank'].values).T.apply(pd.Series.value_counts) / 800


# now plot these proportions by increasing wos hits


fig, ax = plt.subplots(ncols = 2, figsize = (10,4),sharey = True)


ordinal_plot(len(dfx.concept_ord.unique()), concept_outcome_samples, 'Concept conflict', 
            ['No concept conflict','Conflict score 1', 'Conflict score 2'],0)

ordinal_plot(len(dfx.rank_ord.unique()), rank_outcome_samples, 'Rank conflict', 
            ['No Rank conflict','Conflict score 1', 'Conflict score 2','Conflict score 3', 'Conflict score 4'],1)

plt.show()


plt.hist(dfx.gbif_lognoneg, bins = 30)

plt.show()


with pm.Model(coords=coords) as gbif_m_ordinal:
    gbif_data = pm.MutableData('gbif_data', dfx.gbif_lognoneg.values)
    gbif_zeros = pm.MutableData('gbif_zeros', dfx.gbif_zero.values)
    
    counts = pm.Normal('counts',0,1, dims = 'conflicts')
    
    zeros = pm.Normal('zeros',0,1,dims = 'conflicts')
    
     # prior for the cutpoints
    
    
    cutpoints_rank = pm.Normal('cutpoints_rank',
                           mu=[0,1,2,3],
                           sigma=1,
                           transform=pm.distributions.transforms.univariate_ordered,
                           shape = 4,
                           )
    
    cutpoints_concept = pm.Normal('cutpoints_concept',
                           mu=[0,1],
                           sigma=1,
                           transform=pm.distributions.transforms.univariate_ordered,
                           shape = 2,
                           )
    
    p_rank = pm.Deterministic('p_rank',counts[0] * gbif_data + zeros[0] * gbif_zeros)
    p_concept = pm.Deterministic('p_concept',counts[1] * gbif_data + zeros[1] * gbif_zeros)
    
    y_rank = pm.OrderedLogistic("y_rank", p_rank, cutpoints_rank, observed=dfx.rank_ord,compute_p = False)
    y_concept = pm.OrderedLogistic("y_concept", p_concept, cutpoints_concept, observed=dfx.concept_ord,compute_p = False)

    pr = pm.sample_prior_predictive()

Sampling: [counts, cutpoints_concept, cutpoints_rank, y_concept, y_rank, zeros]


with gbif_m_ordinal:
    trace_gbif_ordinal = pm.sample(5000, return_inferencedata = True)

az.to_netcdf(trace_gbif_ordinal, '...')


az.summary(trace_gbif_ordinal, var_names = ['counts','zeros','cutpoints_rank','cutpoints_concept'])


fig, ax = plt.subplots(figsize = (10,4))

az.plot_forest(trace_gbif_ordinal , var_names = ['counts','zeros'],combined = True, ax=ax)

array([<Axes: title={'center': '94.0% HDI'}>], dtype=object)


# posterior predictive checks: outcomes

## create new data that increases gradually from 0 to 4 (because log10 scale), and that has both 0s and 1s for the zeros
thinned_trace = trace_gbif_ordinal.sel(draw=slice(None, None, 5))
n = thinned_trace.posterior['p_rank'].shape[-1]

new_hits = np.repeat(np.linspace(0,4,128),int(n/128) + 1)[:n]
new_zeros = np.concatenate([np.ones(int(n/128), dtype = 'int32'), np.zeros(n -int(n/128), dtype = 'int32')])

with gbif_m_ordinal:
        pm.set_data({"gbif_data":new_hits, "gbif_zeros" :new_zeros}) 
        ppc_rank = pm.sample_posterior_predictive(trace_gbif_ordinal, progressbar = True, var_names = ['y_rank'], predictions = True)
        
with gbif_m_ordinal:
        pm.set_data({"gbif_data":new_hits, "gbif_zeros" :new_zeros}) 
        ppc_concept = pm.sample_posterior_predictive(trace_gbif_ordinal, progressbar = True, var_names = ['y_concept'], predictions = True)

Sampling: [y_rank]

Sampling: [y_concept]


# posterior predictive checks: outcomes

new_hits2_gb = np.linspace(0,4,100)
new_zeros2_gb = np.concatenate([[1], np.zeros(99, dtype = 'int32')])

with gbif_m_ordinal:
        pm.set_data({"gbif_data":new_hits2_gb, "gbif_zeros" :new_zeros2_gb}) 
        thinned_trace = trace_gbif_ordinal.sel(draw=slice(None, None, 10))
        ppc_rank2_gbif = pm.sample_posterior_predictive(thinned_trace, progressbar = True, var_names = ['p_rank'], predictions = True)
        
with gbif_m_ordinal:
        pm.set_data({"gbif_data":new_hits2_gb, "gbif_zeros" :new_zeros2_gb}) 
        thinned_trace = trace_gbif_ordinal.sel(draw=slice(None, None, 10))
        ppc_concept2_gbif = pm.sample_posterior_predictive(thinned_trace, progressbar = True, var_names = ['p_concept'], predictions = True)

Sampling: []

Sampling: []


# put the predictive samples in a dataframe with a row for each ordinal score and each draw as a column 
# (with each cell being the count of that ordinal score for that draw)
# then divide by the total samples for that draw to get the proportion of that score, 

concept_outcome_samples = pd.DataFrame(az.extract(ppc_concept.predictions['y_concept'])['y_concept'].values).T.apply(pd.Series.value_counts) / 400
rank_outcome_samples = pd.DataFrame(az.extract(ppc_rank.predictions['y_rank'])['y_rank'].values).T.apply(pd.Series.value_counts) / 400


# now plot these proportions by increasing wos hits


fig, ax = plt.subplots(ncols = 2, figsize = (10,4),sharey = True)


ordinal_plot(len(dfx.concept_ord.unique()), concept_outcome_samples, 'Concept conflict', 
            ['No concept conflict','Conflict score 1', 'Conflict score 2'],0)

ordinal_plot(len(dfx.rank_ord.unique()), rank_outcome_samples, 'Rank conflict', 
            ['No Rank conflict','Conflict score 1', 'Conflict score 2','Conflict score 3', 'Conflict score 4'],1)

plt.show()


fig, ax = plt.subplots(ncols = 2,figsize = (10,4))

cuts_concept_gbif = trace_gbif_ordinal.posterior['cutpoints_concept']
cuts_rank_gbif = trace_gbif_ordinal.posterior['cutpoints_rank']


plot_probabilities2(len(dfx.concept_ord.unique()),
                    ppc_concept2_gbif,
                    cuts_concept_gbif,'p_concept','Concept conflict',
                    ['No conflict','1 lists disagrees','2 lists disagree'],
                    0,0.5,new_hits2_gb)

plot_probabilities2(len(dfx.rank_ord.unique()),
                    ppc_rank2_gbif,
                    cuts_rank_gbif,'p_rank','Rank conflict',
                    ['No conflict','1 lists disagrees','2 lists disagree','3 lists disagree','all disagree'],
                    1,0.3,new_hits2_gb)


plt.hist(np.log10(dfx.google_mean), bins = 30)
plt.show()


with pm.Model(coords=coords) as google_ordinal:

    # data

    google_data = pm.MutableData('google_data', dfx.loggoogle.values)
    

    
    # prior for the cutpoints
    
    
    cutpoints_rank = pm.Normal('cutpoints_rank',
                           mu=[0,1,2,3],
                           sigma=1,
                           transform=pm.distributions.transforms.univariate_ordered,
                           shape = 4,
                           )
    
    cutpoints_concept = pm.Normal('cutpoints_concept',
                           mu=[0,1],
                           sigma=1,
                           transform=pm.distributions.transforms.univariate_ordered,
                           shape = 2,
                           )

    
    googlecounts = pm.Normal('googlecounts',0,1, dims = 'conflicts')
    
    
    p_rank = pm.Deterministic('p_rank',googlecounts[0] * google_data )
    p_concept = pm.Deterministic('p_concept',googlecounts[1] * google_data) 
    
    # don't compute and store in the trace the inferred probabilities of each categories, based on the cutpoints’ values
    # it takes too much memory; set compute_p to False

    y_rank = pm.OrderedLogistic("y_rank", p_rank, cutpoints_rank, observed=dfx.rank_ord,compute_p = False)
    y_concept = pm.OrderedLogistic("y_concept", p_concept, cutpoints_concept, observed=dfx.concept_ord,compute_p = False)


with google_ordinal:
    trace_google_ordinal = pm.sample(5000)

az.to_netcdf(trace_google_ordinal, '...')


az.summary(trace_google_ordinal, var_names = ['googlecounts','cutpoints_rank','cutpoints_concept'])


fig, ax = plt.subplots(figsize = (6,3))

az.plot_forest(trace_google_ordinal, var_names = 'googlecounts',combined = True,ax=ax)

array([<Axes: title={'center': '94.0% HDI'}>], dtype=object)


# posterior predictive checks: outcomes

thinned_trace = trace_gbif_ordinal.sel(draw=slice(None, None, 5))
n = thinned_trace.posterior['p_rank'].shape[-1]

new_hits = np.repeat(np.linspace(0,4,128),int(n/128) + 1)[:n]

with google_ordinal:
        pm.set_data({"google_data":new_hits}) 
        ppc_rank = pm.sample_posterior_predictive(thinned_trace, progressbar = True, var_names = ['y_rank'], predictions = True)
        
with google_ordinal:
        pm.set_data({"google_data":new_hits})
        thinned_trace = trace_google_ordinal.sel(draw=slice(None, None, 10))
        ppc_concept = pm.sample_posterior_predictive(thinned_trace, progressbar = True, var_names = ['y_concept'], predictions = True)

Sampling: [googlecounts, y_rank]

Sampling: [y_concept]


# posterior predictive checks: outcomes

new_hits2_g = np.linspace(2,6,100)

with google_ordinal:
        pm.set_data({"google_data":new_hits2_g}) 
        thinned_trace = trace_google_ordinal.sel(draw=slice(None, None, 5))
        ppc_rank2_g = pm.sample_posterior_predictive(trace_google_ordinal, progressbar = True, var_names = ['p_rank'], predictions = True)
        
with google_ordinal:
        pm.set_data({"google_data":new_hits2_g}) 
        thinned_trace = trace_google_ordinal.sel(draw=slice(None, None, 5))
        ppc_concept2_g = pm.sample_posterior_predictive(trace_google_ordinal, progressbar = True, var_names = ['p_concept'], predictions = True)

Sampling: []

Sampling: []


# put the predictive samples in a dataframe with a row for each ordinal score and each draw as a column 
# (with each cell being the count of that ordinal score for that draw)
# then divide by the total samples for that draw to get the proportion of that score, 

concept_outcome_samples = pd.DataFrame(az.extract(ppc_concept.predictions['y_concept'])['y_concept'].values).T.apply(pd.Series.value_counts) / 400
rank_outcome_samples = pd.DataFrame(az.extract(ppc_rank.predictions['y_rank'])['y_rank'].values).T.apply(pd.Series.value_counts) / 400


# now plot these proportions by increasing wos hits


fig, ax = plt.subplots(ncols = 2, figsize = (10,4),sharey = True)


ordinal_plot(len(dfx.concept_ord.unique()), concept_outcome_samples, 'Concept conflict', 
            ['No concept conflict','Conflict score 1', 'Conflict score 2'],0)

ordinal_plot(len(dfx.rank_ord.unique()), rank_outcome_samples, 'Rank conflict', 
            ['No Rank conflict','Conflict score 1', 'Conflict score 2','Conflict score 3', 'Conflict score 4'],1)

plt.show()


fig, ax = plt.subplots(ncols = 2,figsize = (10,4))

cuts_concept_g = trace_google_ordinal.posterior['cutpoints_concept']
cuts_rank_g = trace_google_ordinal.posterior['cutpoints_rank']

plot_probabilities2(len(dfx.concept_ord.unique()),
                    ppc_concept2_g,
                    cuts_concept_g,'p_concept','Concept conflict',
                    ['No conflict','1 vs 3','2 vs 2'],
                    0,0.5,new_hits2_g)

plot_probabilities2(len(dfx.rank_ord.unique()),
                    ppc_rank2_g,
                    cuts_rank_g,'p_rank','Rank conflict',
                    ['No conflict','1 rank case','2 rank cases','3 rank cases','4 rank cases'],
                    1,0.2,new_hits2_g)


# rank

fig, ax = plt.subplots(ncols = 3,figsize = (14,4), sharey = True,sharex = False)


plot_probabilities2(len(dfx.rank_ord.unique()),
                    ppc_rank2_wos,
                    cuts_rank_wos,'p_rank','WoS hits',
                    [],
                    0,0.2,new_hits2_w)



plot_probabilities2(len(dfx.rank_ord.unique()),
                    ppc_rank2_gbif,
                    cuts_rank_gbif,'p_rank','GBIF occurences',
                    ['No conflict','1 rank case','2 rank cases','3 rank cases','4 rank cases'],
                    1,0.2,new_hits2_gb)


plot_probabilities2(len(dfx.rank_ord.unique()),
                    ppc_rank2_g,
                    cuts_rank_g,'p_rank','Google hits',
                    [],
                    2,0.2,new_hits2_g)

ax[0].set_xlabel('Log10 WoS hits', fontsize = 14)
ax[1].set_xlabel('Log10 GBIF occurences', fontsize = 14)
ax[2].set_xlabel('Log10 Google hits', fontsize = 14)
fig.tight_layout()

ax[1].set_ylabel(' ')
ax[2].set_ylabel(' ')
# plt.suptitle('Rank conflict by measure of effort', fontsize = 15, y=1.05)
fig.savefig('...', format='tif', dpi=600)

plt.show()


# concept
fig, ax = plt.subplots(ncols = 3,figsize = (14,4), sharey = True)

plot_probabilities2(len(dfx.concept_ord.unique()),
                    ppc_concept2_wos,
                    cuts_concept_wos,'p_concept','WoS hits',
                    [],
                    0,0.6,new_hits2_w)

plot_probabilities2(len(dfx.concept_ord.unique()),
                    ppc_concept2_gbif,
                    cuts_concept_gbif,'p_concept','GBIF occurences',
                    ['No conflict','3 vs. 1','2 vs 2 (or 2 vs 1 vs 1)'],
                    1,0.6,new_hits2_gb)

plot_probabilities2(len(dfx.concept_ord.unique()),
                    ppc_concept2_g,
                    cuts_concept_g,'p_concept','Google hits',
                    [],
                    2,0.6,new_hits2_g)
ax[0].set_xlabel('Log10 WoS hits', fontsize = 14)
ax[1].set_xlabel('Log10 GBIF occurences', fontsize = 14)
ax[2].set_xlabel('Log10 Google hits', fontsize = 14)
fig.tight_layout()

ax[1].set_ylabel(' ')
ax[2].set_ylabel(' ')

# plt.suptitle('Concept conflict by measure of effort', fontsize = 15, y=1.05)
fig.savefig('...', format='tif', dpi=600)

plt.show()

	avibase_id	birdlife_id	tsn	eol_page_id	iucn_name	in_bl	genus_bl	name_bl	family_bl	...	nom_agr	concept_ord	classif_recoded
0	95A724BEBE54EED7	Euplectes diadematus	560040.0	45515027.0	22719168.0	1.0	Euplectes	diadematus	NaN	...	6	0	0
1	D4162FE44AF93BB6	Centropus bernsteini	554754.0	45517807.0	22684215.0	1.0	Centropus	bernsteini	Cuculidae	...	6	0	0
2	96819591FD94B905	Phylloscopus fuscatus	179846.0	45510049.0	22715264.0	1.0	Phylloscopus	fuscatus	Phylloscopidae	...	3	1	1
1734	CFFF429637FFF836	Oreotrochilus stolzmanni	693766.0	1268587.0	NaN	1.0	Oreotrochilus	stolzmanni	Trochilidae	...	3	0	1
1735	1C2F4E80D146FA22	Oxyura maccoa	175180.0	1048502.0	22679820.0	1.0	Oxyura	maccoa	NaN	...	6	0	0

	avibase_id	tot_conflict	tax_conflict	hits1	hits2	hits3	hits4	hits5	wos_hits	genus_bl	family_bl_all	order_bl_all
0	95A724BEBE54EED7	0.0	0.0	2710	2470	2940	2460	2570	1.0	Euplectes	Ploceidae	passeriformes
1	D4162FE44AF93BB6	0.0	0.0	2470	2780	2490	3040	3530	0.0	Centropus	Cuculidae	cuculiformes
2	96819591FD94B905	3.0	3.0	40400	39000	39000	43600	39900	38.0	Phylloscopus	Phylloscopidae	passeriformes
3	CFFF429637FFF836	3.0	3.0	2340	2180	2100	2230	1990	1.0	Oreotrochilus	Trochilidae	caprimulgiformes
4	1C2F4E80D146FA22	0.0	0.0	12300	12300	12700	12400	12400	3.0	Oxyura	Anatidae	anseriformes

	avibase_id	birdlife_id	concept_ord	rank_ord	hits1	hits2	hits3	hits4	hits5	wos_hits	gbif	gbif_lognoneg	wos_zero	wos_lognoneg	google_mean	loggoogle
0	95A724BEBE54EED7	Euplectes diadematus	0	0	2710	2470	2940	2460	2570	1.0	333.0	2.522444	0	0.000000	2630.0	3.419956
1	D4162FE44AF93BB6	Centropus bernsteini	0	0	2470	2780	2490	3040	3530	0.0	746.0	2.872739	1	0.000000	2862.0	3.456670
2	96819591FD94B905	Phylloscopus fuscatus	1	0	40400	39000	39000	43600	39900	38.0	87984.0	4.944404	0	1.579784	40380.0	4.606166
3	CFFF429637FFF836	Oreotrochilus stolzmanni	0	3	2340	2180	2100	2230	1990	1.0	636.0	2.803457	0	0.000000	2168.0	3.336059
4	1C2F4E80D146FA22	Oxyura maccoa	0	0	12300	12300	12700	12400	12400	3.0	15453.0	4.189013	0	0.477121	12420.0	4.094122

	loggoogle	wos_lognoneg	gbif_lognoneg
loggoogle	1.000000	0.756405	0.726334
wos_lognoneg	0.756405	1.000000	0.570808
gbif_lognoneg	0.726334	0.570808	1.000000

	x	y	p1	p2	p3	p4
0	1.482865	0.0	0.960315	0.024710	0.003838	0.011137
1	-0.378768	0.0	0.845141	0.091708	0.015589	0.047562
2	-0.978175	3.0	0.771616	0.130190	0.023550	0.074643
3	0.390296	0.0	0.909885	0.054961	0.008872	0.026282
4	0.309677	0.0	0.904456	0.058136	0.009424	0.027984

Data¶

simulate data¶

Web of Science effort¶

GBIF occurences¶

Google hits¶

Combined figure¶

	mean	sd	hdi_3%	hdi_97%	mcse_mean	mcse_sd	ess_bulk	ess_tail	r_hat
zeros	1.463	0.222	1.067	1.901	0.004	0.003	2941.0	3078.0	1.0
counts	-0.761	0.103	-0.954	-0.568	0.002	0.001	3291.0	3127.0	1.0
cutpoints[0]	1.963	0.102	1.768	2.152	0.002	0.001	2647.0	2822.0	1.0
cutpoints[1]	2.991	0.138	2.740	3.264	0.003	0.002	3034.0	3040.0	1.0
cutpoints[2]	3.270	0.151	2.982	3.549	0.003	0.002	3118.0	2974.0	1.0

	mean	sd	hdi_3%	hdi_97%	mcse_mean	mcse_sd	ess_bulk	ess_tail	r_hat
counts[rank]	-0.375	0.068	-0.503	-0.248	0.001	0.000	12736.0	15484.0	1.0
counts[concept]	0.321	0.046	0.235	0.409	0.000	0.000	13972.0	15152.0	1.0
zeros[rank]	0.632	0.080	0.486	0.786	0.001	0.001	12287.0	14311.0	1.0
zeros[concept]	0.016	0.087	-0.152	0.175	0.001	0.001	15476.0	15422.0	1.0
cutpoints_concept[0]	2.397	0.060	2.284	2.509	0.001	0.000	13510.0	15056.0	1.0
cutpoints_concept[1]	4.086	0.084	3.930	4.246	0.001	0.000	18292.0	15989.0	1.0
cutpoints_rank[0]	2.181	0.066	2.057	2.306	0.001	0.000	11180.0	13530.0	1.0
cutpoints_rank[1]	2.193	0.066	2.068	2.317	0.001	0.000	11211.0	13826.0	1.0
cutpoints_rank[2]	2.229	0.067	2.103	2.354	0.001	0.000	11229.0	13724.0	1.0
cutpoints_rank[3]	4.383	0.103	4.192	4.579	0.001	0.001	16063.0	14072.0	1.0

	mean	sd	hdi_3%	hdi_97%	mcse_mean	mcse_sd	ess_bulk	ess_tail	r_hat
counts[rank]	-0.688	0.028	-0.742	-0.637	0.000	0.000	17152.0	14309.0	1.0
counts[concept]	0.078	0.025	0.032	0.125	0.000	0.000	15108.0	12877.0	1.0
zeros[rank]	0.232	0.143	-0.038	0.495	0.001	0.001	20608.0	16666.0	1.0
zeros[concept]	0.137	0.232	-0.306	0.571	0.002	0.001	22212.0	15791.0	1.0
cutpoints_rank[0]	0.094	0.085	-0.072	0.245	0.001	0.001	16693.0	13783.0	1.0
cutpoints_rank[1]	0.107	0.085	-0.056	0.261	0.001	0.001	16777.0	13825.0	1.0
cutpoints_rank[2]	0.147	0.085	-0.017	0.301	0.001	0.001	16917.0	14070.0	1.0
cutpoints_rank[3]	2.441	0.112	2.236	2.654	0.001	0.001	19981.0	15381.0	1.0
cutpoints_concept[0]	2.445	0.096	2.264	2.623	0.001	0.001	15189.0	13053.0	1.0
cutpoints_concept[1]	4.128	0.112	3.917	4.338	0.001	0.001	17088.0	13524.0	1.0

	mean	sd	hdi_3%	hdi_97%	mcse_mean	mcse_sd	ess_bulk	ess_tail	r_hat
googlecounts[rank]	-1.767	0.053	-1.869	-1.671	0.000	0.000	14747.0	13723.0	1.0
googlecounts[concept]	0.168	0.048	0.079	0.259	0.000	0.000	17223.0	12776.0	1.0
cutpoints_rank[0]	-4.232	0.184	-4.588	-3.901	0.002	0.001	14711.0	13560.0	1.0
cutpoints_rank[1]	-4.217	0.184	-4.560	-3.873	0.002	0.001	14698.0	13533.0	1.0
cutpoints_rank[2]	-4.175	0.184	-4.529	-3.845	0.002	0.001	14718.0	13399.0	1.0
cutpoints_rank[3]	-1.747	0.186	-2.110	-1.414	0.001	0.001	16830.0	14594.0	1.0
cutpoints_concept[0]	2.818	0.191	2.456	3.173	0.001	0.001	17198.0	12879.0	1.0
cutpoints_concept[1]	4.501	0.199	4.122	4.870	0.001	0.001	17859.0	13765.0	1.0