#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Oct  4 19:10:12 2020

@author: daniel s liscia

regexcode.py

The unstructured free text of the pathology reports is turned
into computationally usable data, stored in tabular format.
"""

import pandas as pd
import re

# Input location, relative to the working directory.
path = '../csv/'
datafile = path + 'gastricbiopsies.csv'
# Alternative input file:
# datafile = path + 'biopsiegastriche2019-20.csv'

print(datafile)

# The file is parsed with ';' as the field separator.
df = pd.read_csv(datafile, sep=';')
                                                                                    
# Four parallel lists; each one becomes a column of the output dataframe.
anno = []
isto = []
hpdata = []
npos = []

# Running totals: rows read, negative results, positive results.
biopsie = 0
negative = 0
positive = 0

# The dataframe is walked with itertuples, which the original author
# reports as about 100 times faster than iterrows (tested together with
# the compiled version of the regexes defined here).
# patten1: "heli" ... "ri" -- presumably targets "Helicobacter pylori".
# patten2: "w" ... "y"     -- presumably targets "Warthin-Starry".
patten1 = re.compile("heli.*?ri", flags=re.IGNORECASE)
patten2 = re.compile("w.*?y", flags=re.IGNORECASE)

# Matches the grade written as "(1+)", "(2+)" or "(3+)" in the report text.
# Compiled once, outside the loop, instead of being looked up per row.
grade_pattern = re.compile(r'\([1-3]\+\)')

# Classify every report.  Each row is counted; only rows whose diagnosis
# matches both patten1 and patten2 are recorded, one entry per row in each
# of the four parallel lists (anno, isto, hpdata, npos).
for row in df.itertuples():

    biopsie += 1

    # An empty cell is read by pandas as NaN (a float), which has no
    # .lower(): such a row is counted but cannot be classified.
    if not isinstance(row.Diagnosi, str):
        continue

    diagnosi = row.Diagnosi.lower()  # all diagnoses in lowercase

    # Both patterns must be present, otherwise the report is skipped.
    if not (patten1.search(diagnosi) and patten2.search(diagnosi)):
        continue

    anno.append(row.Anno)
    isto.append(row.Numero)

    # Negative when the report says so explicitly, or when it contains
    # neither 'negativ' nor 'positiv'.
    # NOTE: the second case is really undetermined and should eventually
    # be recorded as 'dub' (for dubious) rather than 'neg'.
    if 'negativ' in diagnosi or 'positiv' not in diagnosi:
        hpdata.append('neg')
        npos.append('0+')
        negative += 1
        continue

    hpdata.append('pos')
    positive += 1

    # Look for 1, 2 or 3 followed by a '+' and enclosed in parentheses;
    # the parentheses are dropped before storing the grade.
    poshp = grade_pattern.search(diagnosi)
    if poshp is not None:
        npos.append(poshp.group(0).strip('()'))
    else:
        # Positive report without an explicit grade: default to '1+'.
        npos.append('1+')

# Print the summary counts.  To check the four lists, print len(anno),
# len(isto), len(hpdata) and len(npos), or dump them row by row with
# zip(anno, isto, hpdata, npos).
print()
for label, value in (
    ('Gastric biopsies 2019-2020:', biopsie),
    ('Stained with Warthin-Starry (regex):', negative+positive),
    ('Negative:', negative),
    ('Positive:', positive),
):
    print(label, value)

# Build the dataframe from the four parallel lists.
columns = {'anno': anno,
           'isto': isto,
           'hp': hpdata,
           'npos': npos}
hpdf = pd.DataFrame(columns)

hpdf.to_excel("../csv/hpdf_regex.xlsx")

print('\nTabular data have been saved in file ../csv/hpdf_regex.xlsx')