#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 18 07:52:06 2020

@author: daniel s liscia

hpstatistics.py
"""
import pandas as pd
import numpy as np
from scipy.stats import chi2_contingency
from scipy.stats import kendalltau
# For the Matthews correlation coefficient (equivalent to the phi coefficient):
from sklearn.metrics import matthews_corrcoef 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import cohen_kappa_score
# For the CONFIDENCE INTERVALS
from math import sqrt
from scipy.special import ndtri

path = '/home/daniel/Dropbox/python/csv/'

datafile = 'hpdf.csv'

datafile = path+datafile

print(datafile)

dfmicro=pd.read_csv(datafile, sep=';')
dfmicro=dfmicro.drop_duplicates()

datafile = 'hpdf-dp.csv'

datafile = path+datafile

print(datafile)

dfdigit=pd.read_csv(datafile, sep=';')
dfdigit=dfdigit.drop_duplicates()

print(dfmicro.columns)
# Index(['anno', 'isto', 'hp', 'npos'], dtype='object')

print(dfdigit.columns,'\n')
# Index(['anno', 'isto', 'hpdp', 'nposdp'], dtype='object')

# MERGE
df_joined = pd.merge(dfmicro, dfdigit, on=['anno', 'isto'])

print('df_joined.columns:')
print(df_joined.columns,'\n')

number_observations = len(df_joined)
print('Number of cases = %d'% number_observations)

# ---------GROUPBY-----------------------------------------------------
# n_by_state = df.groupby("state")["last_name"].count()
print('\nCOUNT GROUPS:')
print('-------------')
npos_grouped = df_joined.groupby(["npos"])["hp"].count()

print('\nNPOS GROUPED:')
print(npos_grouped)

nposdp_grouped = df_joined.groupby(["nposdp"])["hp"].count()
print('\nNPOSDP GROUPED:')
print(nposdp_grouped)

print('\n--------------------------------------------------------------')

contingency_table = pd.crosstab(df_joined.hp, df_joined.hpdp,
                          rownames=['WSI'],
                          colnames=['microscope'],
                          margins=True, margins_name="Total")
print('Crosstab:\n')
print(contingency_table)

print('\nNote: the numbers match perfectly\n')

# Chi-square test of independence.
print("Chi Square statistics:")
print("----------------------")
print("http://web.pdx.edu/~newsomj/pa551/lectur15.htm\
\nThe appropriate test to compare group differences with a dichotomous\
\noutcome is the chi-square statistic. And, we can also show that the\
\ntest of the phi coefficient is equivalent to the chi-square test:\n")

# Run the test on the table WITHOUT the margin totals: passing the crosstab
# that includes the "Total" row/column would treat the margins as extra
# categories and inflate both chi2 and the degrees of freedom.
chi2, pval, dof, expected = chi2_contingency(pd.crosstab(df_joined.hp, df_joined.hpdp))

print("Degree of freedom = %d \nChi2 = %f \npval = %f" % (dof, chi2, pval))
print("\nExpected table:")
print("---------------")
print(expected)

print('\nIn statistics, the phi coefficient (or mean square contingency \
      \ncoefficient, denoted by φ or rφ) is a measure of association \
      \nfor two binary variables, introduced by Karl Pearson.')


print("\n----------------------PEARSON'S---------------------------------\n")

# Pearson's Phi Coefficient
# da: https://en.wikipedia.org/wiki/Phi_coefficient

# Use the chi-square WITHOUT Yates' continuity correction here, so that
# sqrt(chi2/n) matches the phi / Matthews coefficient computed further below.
chi2_nocorr = chi2_contingency(pd.crosstab(df_joined.hp, df_joined.hpdp),
                               correction=False)[0]
phi_coefficient = sqrt(chi2_nocorr/number_observations)

print("Pearson's Phi Coefficient = %.3f" % phi_coefficient)

print("\n----------------------CRAMER'S----------------------------------")

print("\nIn statistics, Cramér's V (sometimes referred to as Cramér's phi \
       \nand denoted as φc) is a measure of association between two nominal\
       \nvariables, giving a value between 0 and +1 (inclusive).\
       \n 0 indicates no association between the two variables.\
       \n 1 indicates a strong association between the two variables.\
       \n\
       \nPhi is used for 2x2 matrices and Cramer's V for more than 2x2\
       \nCramer's V is a way of calculating correlation in tables which \
       \nhave more than 2x2 rows and columns. It is used as post-test to\
       \ndetermine strengths of association after chi-square has determined\
       \nsignificance. V is calculated by first calculating chi-square")

# print("\ndf_joined[['hp','hpdp']]:")
# print(df_joined[['hp','hpdp']])

def cramers_corrected_stat(confusion_matrix):
    """ calculate Cramers V statistic for categorical-categorical association.
        uses correction from Bergsma and Wicher, 
        Journal of the Korean Statistical Society 42 (2013): 323-328
    """
    # chi2 = ss.chi2_contingency(confusion_matrix)[0]
    # https://stackoverflow.com/questions/45020538/
    # how-to-apply-cramer-v-on-2x2-matrix:
    # When you compute the Cramér coefficient, you must compute chi2 
    # without continuity correction. For a 2x2 matrix, chi2_contingency 
    # uses continuity correction by default. So you must tell chi2_contingency
    # to not use continuity correction by giving the argument correction=False
    # chi2 = ss.chi2_contingency(confusion_matrix, correction=False)[0]
    chi2 = chi2_contingency(confusion_matrix, correction=False)[0]     
    n = confusion_matrix.sum().sum()
    phi2 = chi2/n
    r,k = confusion_matrix.shape
    phi2corr = max(0, phi2 - ((k-1)*(r-1))/(n-1))    
    rcorr = r - ((r-1)**2)/(n-1)
    kcorr = k - ((k-1)**2)/(n-1)
    return np.sqrt(phi2corr / min( (kcorr-1), (rcorr-1)))
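
# Toy illustration (synthetic counts, not study data) just to exercise the
# function; the Bergsma bias correction mainly matters for small samples
# like this one.
_toy_table = pd.DataFrame([[12, 5], [4, 9]])
print("\nBias-corrected Cramer's V on a toy 2x2 table (sketch) = %.3f"
      % cramers_corrected_stat(_toy_table))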

contingency_table = pd.crosstab(df_joined.hp, df_joined.hpdp)

cramerstat = cramers_corrected_stat(contingency_table)
print("\nPositive/Negative microscopy vs DP:")
# print("\nCramer's corrected statistics = %.3f" % cramerstat)
print("\nCramer’s Phi for 2x2 tables = %.3f" % cramerstat)


print('\n--------------------------------------------------------------')
print("Cramer's V following https://www.statology.org/cramers-v-in-python/" )
contingency_table2 = pd.crosstab(df_joined.hp, df_joined.hpdp, margins=False)

data = contingency_table2.astype(int).to_numpy()
print("\nDati per la Cramer's seconda maniera:")
print(data)

#Chi-squared test statistic, sample size, and minimum of rows and columns
X2 = chi2_contingency(data, correction=False)[0]
n = np.sum(data)
minDim = min(data.shape)-1

#calculate Cramer's V 
V = np.sqrt((X2/n) / minDim)

#display Cramer's V
print("\nCramer’s V for a 2×2 Table (Phi) = %.3f" % V)

print('\n--------------------------------------------------------------')

# prepare data for kendall's test and for Matthews correlation coefficient:

# the following works, but it only yields the 'hp' column:
# df_joined_int = df_joined['hp'].map({'neg':0, 'pos':1})

df_ordinal_hp = df_joined['hp'].map({'neg':0, 'pos':1})
df_ordinal_hpdp = df_joined['hpdp'].map({'neg':0, 'pos':1})


# print(df_ordinal_hp)
# print(df_ordinal_hpdp)

hp_array = df_ordinal_hp.astype(int).to_numpy()
# print(hp_array)

hpdp_array = df_ordinal_hpdp.astype(int).to_numpy()
# print(hpdp_array)

y_true = hp_array   # Ground truth (correct) target values.
y_pred = hpdp_array # Estimated targets as returned by a classifier.

mcc = matthews_corrcoef(y_true, y_pred)

print("\nThe MCC is in essence a correlation coefficient value between\
\n-1 and +1. A coefficient of +1 represents a perfect prediction,\
\n0 an average random prediction and -1 an inverse prediction. \
\nThe statistic is also known as the phi coefficient.\
\n[source: Wikipedia]\
\nThe MCC is defined identically to Pearson's phi coefficient,\
\nintroduced by Karl Pearson (but the term MCC is widely used \
\nin the field of machine learning).")
       
print("\nCompute the Matthews correlation coefficient:\n")
print('MCC = %.3f' % mcc)
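
# Quick sanity-check sketch (assumes phi_coefficient above was computed from
# the chi-square without continuity correction): MCC and Pearson's phi should
# coincide up to rounding for this 2x2 table.
print('|MCC - phi| = %.6f' % abs(mcc - phi_coefficient))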

"""
Calculates Kendall’s tau, a correlation measure for ordinal data.

Kendall’s tau is a measure of the correspondence between two rankings.
Values close to 1 indicate strong agreement, values close to -1 indicate
strong disagreement. This is the tau-b version of Kendall’s tau which 
accounts for ties.

References

W.R. Knight, “A Computer Method for Calculating Kendall’s Tau with Ungrouped
Data”, Journal of the American Statistical Association, Vol. 61, No. 314,
Part 1, pp. 436-439, 1966.
"""

print('\n--------------------------------------------------------------')
print("Use Kendall's test for tables in which both rows and columns contain \
\nordered values. Kendall’s Tau is a non-parametric measure of \
\nrelationships between columns of ranked data. The Tau correlation \
\ncoefficient returns a value of 0 to 1, where:\
\n    0 is no relationship,\
\n    1 is a perfect relationship.")
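
# Tiny illustrative example (toy rankings, not study data): tau-b handles
# tied ranks, which is why it suits these coarse ordinal scores.
_toy_tau, _toy_p = kendalltau([1, 2, 3, 3, 4], [1, 2, 3, 4, 4])
print('\nToy tau-b example = %.3f (p = %.3f)' % (_toy_tau, _toy_p))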

# tau-coeff, p_value = kendalltau(hp_array, hpdp_array)
tau_coef, p = kendalltau(hp_array, hpdp_array)

print('\nKendall correlation of positive/negative calls, microscopy vs digital:')

print('\nKendall correlation coefficient = %.3f' % tau_coef)

print('\nKendall P value = %.4f' % p)

# interpret the significance
alpha = 0.05
if p > alpha:
    print('\nNo significant correlation (fail to reject H0), p = %.3f' % p)
else:
    print('\nSignificant correlation (reject H0), p = %.3f' % p)

print('\n--------------------------------------------------------------')
#_______Data for the 1+/2+/3+ scores_________________________________________

# anno  isto   hp npos
# anno  isto hpdp nposdp
   
df_ordinal_npos = df_joined['npos'].map({'0+':0, '1+':1, '2+':2,'3+':3})
df_ordinal_nposdp = df_joined['nposdp'].map({'0+':0, '1+':1, '2+':2,'3+':3})

npos_array = df_ordinal_npos.astype(int).to_numpy()
# print('\ndata_npos:')
# print(data_npos)

nposdp_array = df_ordinal_nposdp.astype(int).to_numpy()
# print('\ndata_nposdp:')
# print(data_nposdp)
# ---------------------------------------------------------------------

tau_coef, p = kendalltau(npos_array, nposdp_array)

print('\nKendall correlation of the 0/1/2/3 scores, microscopy vs digital:')

print('\nKendall correlation coefficient = %.3f' % tau_coef)

print('\nKendall P value = %.4f' % p)

# interpret the significance
alpha = 0.05
if p > alpha:
    print('\nNo significant correlation (fail to reject H0), p = %.3f' % p)
else:
    print('\nSignificant correlation (reject H0), p = %.3f' % p)

print('\n--------------------------------------------------------------')

# Select elements from a Numpy array based on a single condition

# newArr = arr[arr < 10]
npos_array123 = npos_array[npos_array != 0]
nposdp_array123 = nposdp_array[npos_array != 0]

tau_coef, p = kendalltau(npos_array123, nposdp_array123)

print('\nKendall correlation of the 1/2/3 scores, microscopy vs digital:')

print('\nKendall correlation coefficient = %.3f' % tau_coef)

print('\nKendall P value = %.4f' % p)

# interpret the significance
alpha = 0.05
if p > alpha:
    print('\nNo significant correlation (fail to reject H0), p = %.3f' % p)
else:
    print('\nSignificant correlation (reject H0), p = %.3f' % p)
print('\n--------------------------------------------------------------')
# Try to compute sensitivity and specificity
# from sklearn.metrics import confusion_matrix
# from https://stackoverflow.com/posts/53869466/revisions
# The one-liner to get true positives etc. out of the confusion matrix is to
# ravel it
# (https://docs.scipy.org/doc/numpy/reference/generated/numpy.ravel.html):
# numpy.ravel(a, order='C') returns a contiguous flattened array.

# Already assigned above for matthews_corrcoef:
# y_true = hp_array   # Ground truth (correct) target values.
# y_pred = hpdp_array # Estimated targets as returned by a classifier.

tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

print('Confusion matrix hp vs hpdp:', tn, fp, fn, tp)

print('\nTotal tn+fp+fn+tp = %d' % (tn+fp+fn+tp))

print('\nPerformance measures:\n')
print('True Positive:', tp)
print('False Positive:', fp)
print('True Negative:', tn)
print('False Negative:', fn)
# taken from:
#   Baratloo A, Hosseini M, Negida A, El Ashal G. Part 1:
#   Simple Definition and Calculation of Accuracy, Sensitivity
#   and Specificity. Emerg (Tehran). 2015;3(2):48-49.
print('Accuracy: %.3f' % ((tp+tn)/(tp+tn+fp+fn)),'((tp+tn)/(tp+tn+fp+fn))')
print('Sensitivity: %.3f' % (tp/(tp+fn)),'(Recall, True Positive Rate, TPR)' )
print('Specificity: %.3f' % (tn/(tn+fp)),'(True Negative Rate - TNR)' )
print('PPV: %.3f' % (tp/(tp+fp)),'(Precision - Positive Predictive Value)' )
print('NPV: %.3f' % (tn/(tn+fn)),'(Negative Predictive Value)' )
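
# Optional cross-check (sketch): sklearn's classification_report reports the
# same quantities under different names -- recall of the 'pos' class is the
# sensitivity, recall of the 'neg' class is the specificity, and precision of
# the 'pos' class is the PPV.
from sklearn.metrics import classification_report
print('\nclassification_report cross-check:\n')
print(classification_report(y_true, y_pred, target_names=['neg', 'pos']))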

print('\n-------------------------------------------------------------')

#------------ CONFIDENCE INTERVALS ------------------------------------
# Da: https://gist.github.com/maidens/29939b3383a5e57935491303cf0d8e0b

def _proportion_confidence_interval(r, n, z):
    """Compute confidence interval for a proportion.
    
    Follows notation described on pages 46--47 of [1]. 
    
    References
    ----------
    [1] R. G. Newcombe and D. G. Altman, Proportions and their differences, in Statistics
    with Confidence: Confidence intervals and statistical guidelines, 2nd Ed., D. G. Altman,
    D. Machin, T. N. Bryant and M. J. Gardner (Eds.), pp. 45-57, BMJ Books, 2000.
    """
    
    A = 2*r + z**2
    B = z*sqrt(z**2 + 4*r*(1 - r/n))
    C = 2*(n + z**2)
    return ((A-B)/C, (A+B)/C)
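
# Quick illustrative call (toy numbers, not study data): Wilson 95% interval
# for 8 "successes" out of 10, using z = 1.96.
print('Wilson CI for 8/10 (sketch):', _proportion_confidence_interval(8, 10, 1.96))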

def sensitivity_and_specificity_with_confidence_intervals(TP, FP, FN, TN, alpha=0.95):
    """Compute confidence intervals for sensitivity and specificity using Wilson's method. 
    
    This method does not rely on a normal approximation and results in accurate 
    confidence intervals even for small sample sizes.
    
    Parameters
    ----------
    TP : int
        Number of true positives
    FP : int 
        Number of false positives
    FN : int
        Number of false negatives
    TN : int
        Number of true negatives
    alpha : float, optional
        Desired confidence. Defaults to 0.95, which yields a 95% confidence interval. 
    
    Returns
    -------
    sensitivity_point_estimate : float
        Numerical estimate of the test sensitivity
    specificity_point_estimate : float
        Numerical estimate of the test specificity
    sensitivity_confidence_interval : Tuple (float, float)
        Lower and upper bounds on the alpha confidence interval for sensitivity
    specificity_confidence_interval
        Lower and upper bounds on the alpha confidence interval for specificity 
        
    References
    ----------
    [1] R. G. Newcombe and D. G. Altman, Proportions and their differences, in Statistics
    with Confidence: Confidence intervals and statistical guidelines, 2nd Ed., D. G. Altman,
    D. Machin, T. N. Bryant and M. J. Gardner (Eds.), pp. 45-57, BMJ Books, 2000.
    [2] E. B. Wilson, Probable inference, the law of succession, and statistical inference,
    J Am Stat Assoc 22:209-12, 1927. 
    """
    
    # ndtri is the inverse of the standard normal CDF, so this is the two-sided
    # z-value for the requested confidence level (e.g. z = 1.96 for alpha = 0.95).
    z = -ndtri((1.0-alpha)/2)
    
    # Compute sensitivity using method described in [1]
    sensitivity_point_estimate = TP/(TP + FN)
    sensitivity_confidence_interval = _proportion_confidence_interval(TP, TP + FN, z)
    
    # Compute specificity using method described in [1]
    specificity_point_estimate = TN/(TN + FP)
    specificity_confidence_interval = _proportion_confidence_interval(TN, TN + FP, z)
    
    return sensitivity_point_estimate, specificity_point_estimate, \
           sensitivity_confidence_interval, specificity_confidence_interval

print('\nConfidence Intervals:\n')

for a in [0., 0.5, 0.9, 0.95, 0.99, 0.999999]:
    sensitivity_point_estimate, specificity_point_estimate, \
        sensitivity_confidence_interval, specificity_confidence_interval \
        = sensitivity_and_specificity_with_confidence_intervals(tp, fp, fn, tn, alpha=a)
    # sensitivity_and_specificity_with_confidence_intervals(tp, fp, fn, tn, alpha=0.95)    
    print("Sensitivity: %.3f, Specificity:  %.3f" %(sensitivity_point_estimate, specificity_point_estimate))
    print("alpha =  %.3f CI for sensitivity:"%a, sensitivity_confidence_interval)
    print("alpha =  %.3f CI for specificity:"%a, specificity_confidence_interval)
    print("")

print('\n-------------------------------------------------------------')

print('\nPrint the false positives and false negatives:\n')

print('FALSE POSITIVES (micro neg - digital pos): \n')
print(df_joined.loc[(df_joined['hp'] == 'neg') & (df_joined['hpdp'] == 'pos')])

print('\n')
print('FALSE NEGATIVES (micro pos - digital neg): \n')
print(df_joined.loc[(df_joined['hp'] == 'pos') & (df_joined['hpdp'] == 'neg')])


print("\nCohen's KAPPA-------------------------------------------------\n")

# uses: from sklearn.metrics import cohen_kappa_score
# Already assigned above for matthews_corrcoef:
# y_true = hp_array   # Ground truth (correct) target values.
# y_pred = hpdp_array # Estimated targets as returned by a classifier.

kappa_score = cohen_kappa_score(y_true, y_pred)
print("Cohen's Kappa statistics for pos vs. neg = %.3f "% kappa_score)

print('\n--------------------------------------------------------------\n')
# See HALA M. T. EL-ZIMAITY 1996

y_true = npos_array
y_pred = nposdp_array

# it works!
# print('y_true',y_true)
# print('y_pred',y_pred)

kappa_score = cohen_kappa_score(y_true, y_pred)
print("Cohen's Kappa statistics for 0+/1+/2+/3+ = %.3f "% kappa_score)

print('\n--------------------------------------------------------------\n')
# Drop the 0+ cases from the arrays:
# Select elements from a Numpy array based on a condition
# newArr = arr[arr < 10]
# for npos_array, nposdp_array -- see the Kendall tau section above.
# NB: both arrays must be filtered with the SAME mask, otherwise the paired
# observations get misaligned (and the two arrays can even differ in length).
mask_nonzero = npos_array != 0
y_true = npos_array[mask_nonzero]
y_pred = nposdp_array[mask_nonzero]

# it works!
# print('y_true',y_true)
# print('y_pred',y_pred)

kappa_score = cohen_kappa_score(y_true, y_pred)
print("Cohen's Kappa statistics for 1+/2+/3+ = %.3f "% kappa_score)

print('\n--------------------------------------------------------------\n')