inelegant

"""

A script to determine how many questions in a given annotation file received "Off-Topic" responses.

Things this script does:
 - Examines all responses in file and calculates total of responses that are "Off-Topic".

Output:
 - An excel file containing all questions that triggered Off-Topic responses
 - Totals: Number of questions, number of questions with on-topic answer, number of questions with off-topic answer

Params:

"""

import re
import os
import openpyxl
from openpyxl import Workbook

def assessCoverageGaps(fileName, whichPath):

    # For each annotated transcript, add all unique response ID's to dict as key, with type annotation as value
    # For each link annotation file, compare responses to dict and compile totals of how many were "off topics"
    # Need to create a list of all types deemed "Off-Topic"
    # NEED TO CONSIDER ONLY QUESTIONS THAT HAVE BEEN LINK ANNOTATED OR RATED AS 4, CHECK LINK ANNOTATION TYPES TOO!!!! <------------


    # Read all file names from annotation folder and store only annotated transcripts to list
    annotationRE = re.compile(r'Annotated')
    #fullDirectoryList = os.listdir("C:\\Users\\CENSORED\\DeskEtop\\svn_ndt\\testing\\Annotations\\type-annotations")
    fullDirectoryList = os.listdir("C:\\Users\\mello\\Desktop\\testing\\Annotations\\type-annotations")
    annotatedTranscriptList = []
    for name in fullDirectoryList:
        if annotationRE.search(name) and not name == "Annotated - Eva-Kor-Full-Transcript-Jill.xlsx":
            annotatedTranscriptList.append(name)

    print(annotatedTranscriptList)

    # A Dict to hold all response IDs and their corresponding types
    typeDict = {}

    # Read each file in one by one, find "ID" column, then iter through all cells in that column, adding unique IDs to Dict with type as value
    for transcript in annotatedTranscriptList:
        #annotatedTranscript = openpyxl.load_workbook("C:\\Users\\CENSORED\\Desktop\\svn_ndt\\testing\\Annotations\\type-annotations\\"+transcript)
        annotatedTranscript = openpyxl.load_workbook("C:\\Users\\mello\\Desktop\\testing\\Annotations\\type-annotations\\"+transcript)
        annotatedSheet = annotatedTranscript.active


        # Find column letter for ID
        IDRE = re.compile(r'id')
        IDColumn = "ZZ"

        for col in annotatedSheet.iter_cols(min_row = 1, max_col = annotatedSheet.max_column, max_row = 1):
            for cell in col:
                if cell.value:
                    if IDRE.search((cell.value).lower()):
                        IDColumn = cell.column


        # Find column letter for type
        typeRE = re.compile(r'type')
        typeColumn = ""

        for col in annotatedSheet.iter_cols(min_row = 1, max_col = annotatedSheet.max_column, max_row = 1):
            for cell in col:
                if cell.value:
                    if typeRE.search((cell.value).lower()):
                        typeColumn = cell.column


        # Compile typeDict
        for cellRows in annotatedSheet[IDColumn+"2":IDColumn+str(annotatedSheet.max_row)]:
            for cell in cellRows:
                if cell.value:
                    if cell.value not in typeDict and annotatedSheet[typeColumn+str(cell.row)].value:
                        #print("yes")
                        typeDict[cell.value] = annotatedSheet[typeColumn+str(cell.row)].value


    # The file path for the annotation files
    # Work Machine
    """
    if whichPath == "machine":
        newFilePath = "C:\\Users\\CENSORED\\Desktop\\svn_ndt_all\\data-analysis\\alfred-testing-transcripts\\by-machine\\excel-version\\"
    elif whichPath == "survivor":
        newFilePath = "C:\\Users\\CENSORED\\Desktop\\svn_ndt_all\\data-analysis\\alfred-testing-transcripts\\by-survivor\\excel-version\\"
    elif whichPath == "pinchas":
        newFilePath = "C:\\Users\\CENSORED\\Desktop\\svn_ndt_all\\data-analysis\\additional-questions-for-linking"
    else:
        print("Invalid whichPath")

    """
    # Home Machine
    if whichPath == "machine":
        newFilePath = "C:\\Users\\mello\\Desktop\\svn_ndt_all\\data-analysis\\alfred-testing-transcripts\\by-machine\\excel-version\\"
    elif whichPath == "survivor":
        newFilePath = "C:\\Users\\mello\\Desktop\\svn_ndt_all\\data-analysis\\alfred-testing-transcripts\\by-survivor\\excel-version\\"
    elif whichPath == "pinchas":
        newFilePath = "C:\\Users\\mello\\Desktop\\svn_ndt_all\\data-analysis\\additional-questions-for-linking"
    else:
        print("Invalid whichPath")


    # Open link annotation file
    thisFile = openpyxl.load_workbook(newFilePath+fileName+".xlsx")
    thisSheet = thisFile.active

    # Assign column letters for each survivor
    pinchasLetter = "ZZ"
    samLetter = "ZZ"
    fritzieLetter = "ZZ"
    aaronLetter = "ZZ"
    izzyLetter = "ZZ"
    adinaLetter = "ZZ"
    janineLetter = "ZZ"
    matusLetter = "ZZ"
    evaKLetter = "ZZ"
    evaSLetter = "ZZ"
    anitaLetter = "ZZ"
    reneeLetter = "ZZ"

    for col in thisSheet.iter_cols(min_row = 1, max_col = thisSheet.max_column, max_row = 1):
        for cell in col:
            if cell.value:
                cellValueLow = (cell.value).lower()
                cellValueStripped = cellValueLow.strip()
                if cellValueStripped == "pinchas":
                    pinchasLetter = cell.column
                elif cellValueStripped == "sam":
                    samLetter = cell.column
                elif cellValueStripped == "fritzie":
                    fritzieLetter = cell.column
                elif cellValueStripped == "aaron":
                    aaronLetter = cell.column
                elif cellValueStripped == "izzy":
                    izzyLetter = cell.column
                elif cellValueStripped == "adina":
                    adinaLetter = cell.column
                elif cellValueStripped == "janine":
                    janineLetter = cell.column
                elif cellValueStripped == "matus":
                    matusLetter = cell.column
                elif cellValueStripped == "eva k":
                    evaKLetter = cell.column
                elif cellValueStripped == "eva s":
                    evaSLetter = cell.column
                elif cellValueStripped == "anita":
                    anitaLetter = cell.column
                elif cellValueStripped == "renee":
                    reneeLetter = cell.column

    print("Pinchas: "+pinchasLetter)


    # Find column letter for Response ID, asr, user text, On-Topic and Off-Topic
    responseIDRE = re.compile(r'response id')
    responseIDColumn = "ZZ"
    asrColumn = "ZZ"
    asrRE = re.compile(r'asr')
    userTextColumn = "ZZ"
    userTextRE = re.compile(r'user')
    onTopicRE = re.compile(r'on-topic')
    onTopicColumn = "ZZ"
    offTopicRE = re.compile(r'off-topic')
    offTopicColumn = "ZZ"

    for col in thisSheet.iter_cols(min_row = 1, max_col = thisSheet.max_column, max_row = 1):
        for cell in col:
            if cell.value:
                if responseIDRE.search((cell.value).lower()):
                    responseIDColumn = cell.column
                elif asrRE.search((cell.value).lower()):
                    asrColumn = cell.column
                elif userTextRE.search((cell.value).lower()):
                    userTextColumn = cell.column
                elif onTopicRE.search((cell.value).lower()):
                    onTopicColumn = cell.column
                elif offTopicRE.search((cell.value).lower()):
                    offTopicColumn = cell.column

    print("ResponseID: "+responseIDColumn)
    print("ASR: "+asrColumn)
    print("UserText: "+userTextColumn)
    print("On-Topic: "+onTopicColumn)
    print("Off-Topic: "+offTopicColumn)


    # List of all "off-topic" types
    offTopicTypes = ["cant_answer", "cant_answer_reason", "cant_answer_time", "cant_answer_war", "change_topic", "direct_to_resource", "dont_understand", "other_experience",
                     "rephrase", "say_again", "segue_to_related_topic", "seque_to_specific_story", "segue_to_story", "subject_idea", "subject_yes_no", "topic_intro",
                     "topic_request_for_story"]


    # List of "won't answer" types - ANALYZE SEPERATELY FROM OFF-TOPICS!!!
    wontAnswerTypes = ["wont_answer", "wont_answer_inappropriate", "wont_answer_now", "wont_answer_too_personal"]


    # Calculate data totals and copy over questions, responses and types for off-topic responses
    totalQuestions = 0
    offTopicQuestions = 0
    onTopicQuestions = 0
    unrated = 0
    wontAnswerQuestions = 0
    IDNotInDict = 0

    gapCoverage = Workbook()
    gapCoverageSheet = gapCoverage.active
    gapCoverageSheet["A1"] = "Question"
    gapCoverageSheet["B1"] = "Response"
    gapCoverageSheet["C1"] = "Type"
    gapCoverageSheet["D1"] = "Where?"
    gapCoverageSheet["E1"] = "Total Questions"
    gapCoverageSheet["F1"] = "Off Topic"
    gapCoverageSheet["G1"] = "On Topic"
    gapCoverageSheet["H1"] = "ID Changed"
    gapCoverageSheet["I1"] = "Unrated"
    gapCoverageSheet["J1"] = "Old ID"
    gapCoverageSheet["K1"] = "Invalid Rating"
    gapCoverageSheet["L1"] = "Location"
    gapCoverageCounter = 2
    oldIDCounter = 2
    invalidCounter = 2

    # A dict with survivor number as key and survivor letter as value
    letterDict = {}
    letterDict["0"] = pinchasLetter
    letterDict["2"] = anitaLetter
    letterDict["3"] = samLetter
    letterDict["4"] = fritzieLetter
    letterDict["5"] = aaronLetter
    letterDict["6"] = reneeLetter
    letterDict["7"] = evaSLetter
    letterDict["8"] = adinaLetter
    letterDict["9"] = janineLetter
    letterDict["10"] = izzyLetter
    letterDict["11"] = matusLetter
    letterDict["12"] = evaKLetter

    # an RE to check for annotation NO_GOOD_RESPONSE
    noRE = re.compile(r'no')

    # 1) First check if the question/response pair was rated as a 4 or 3, if it was a 4, only check type of ID in response ID column [DONE]
    # if it was a 3, check if there are alternatives given, if there are check the type of the alternatives, if not, check type of ID in response ID column [DONE]
    # if it was a 1 or 2 check if there are alternatives given, if there are check the type of the alternatives [DONE]
    # also check for "NO_GOOD_RESPONSE" annotations, in these cases, count these as "offTopicQuestions" <--
    # also check for ratings of 5 and 6!!! [DONE]
    # add column to output sheet to indicate if off topic response is from annotation column or response ID column [DONE]
    for cellRows in thisSheet[asrColumn+"2":asrColumn+str(thisSheet.max_row)]:
        for cell in cellRows:
            if cell.value and thisSheet[responseIDColumn+str(cell.row)].value:
                totalQuestions += 1
                # store response ID and all associated annotations in variables
                thisResponseID = thisSheet[responseIDColumn+str(cell.row)].value
                thisOnTopic = thisSheet[onTopicColumn+str(cell.row)].value
                thisOffTopic = thisSheet[offTopicColumn+str(cell.row)].value


                # If the response is rated as 4
                if thisOnTopic == 4:
                    # If response ID is in the type Dict
                    if thisResponseID in typeDict:
                        # If this response is off topic, increment counter and copy data to gap coverage sheet
                        if typeDict[thisResponseID] in offTopicTypes:
                            offTopicQuestions += 1
                            # If User Text column has transcription, use this as question, else use value in asr column
                            if thisSheet[userTextColumn+str(cell.row)].value and not (thisSheet[userTextColumn+str(cell.row)].value).lower() == "check audio":
                                gapCoverageSheet["A"+str(gapCoverageCounter)] = thisSheet[userTextColumn+str(cell.row)].value
                            else:
                                gapCoverageSheet["A"+str(gapCoverageCounter)] = cell.value
                            gapCoverageSheet["B"+str(gapCoverageCounter)] = thisResponseID
                            gapCoverageSheet["C"+str(gapCoverageCounter)] = typeDict[thisResponseID]
                            gapCoverageSheet["D"+str(gapCoverageCounter)] = "response"
                            gapCoverageCounter += 1


                        # If this response is on topic, increment counter
                        else:
                            onTopicQuestions += 1

                    # If response ID not in type Dict, tally number of ID's not in Dict
                    else:
                        IDNotInDict += 1
                        gapCoverageSheet["J"+str(oldIDCounter)] = thisResponseID
                        oldIDCounter += 1


                # If the response is rated as 3
                elif thisOnTopic == 3:
                    # Extract survivor number from response ID
                    pinchasRE = re.compile(r'0')
                    splitResponse = thisResponseID.split("-")

                    if pinchasRE.match(splitResponse[0]):
                        survivorNumber = "0"
                        #print("EES")
                    else:
                        survivorNumber = splitResponse[0]
                    # Check if alternative responses are provided in survivor column
                    if thisSheet[(letterDict[survivorNumber])+str(cell.row)].value:
                        # Split cell value into list of individual IDs
                        cellSplit = (thisSheet[(letterDict[survivorNumber])+str(cell.row)].value).split("\n")
                        isOffTopic = "false"
                        noGoodResponse = "false"
                        OTResponseID = "ZZ"
                        #print("YES")
                        # Check each ID to see if it is Off Topic, if at least 1 ID is Off Topic set isOffTopic to true
                        for ID in cellSplit:
                            if ID in typeDict:
                                if typeDict[ID] in offTopicTypes:
                                    isOffTopic = "true"
                                    OTResponseID = ID

                            elif noRE.match(ID.lower()):
                                    noGoodResponse = "true"
                                    noGoodResponseID = ID
                            else:
                                IDNotInDict += 1
                                gapCoverageSheet["J"+str(oldIDCounter)] = thisResponseID
                                oldIDCounter += 1


                        # If isOffTopic is true, add question to off topic count, else add to on topic count
                        if isOffTopic == "true":
                            offTopicQuestions += 1
                            # If User Text column has transcription, use this as question, else use value in asr column
                            if thisSheet[userTextColumn+str(cell.row)].value and not (thisSheet[userTextColumn+str(cell.row)].value).lower() == "check audio":
                                gapCoverageSheet["A"+str(gapCoverageCounter)] = thisSheet[userTextColumn+str(cell.row)].value
                            else:
                                gapCoverageSheet["A"+str(gapCoverageCounter)] = cell.value
                            gapCoverageSheet["B"+str(gapCoverageCounter)] = OTResponseID
                            gapCoverageSheet["C"+str(gapCoverageCounter)] = typeDict[OTResponseID]
                            gapCoverageSheet["D"+str(gapCoverageCounter)] = "annotation"
                            gapCoverageCounter += 1

                        elif noGoodResponse == "true":
                            offTopicQuestions += 1
                            # If User Text column has transcription, use this as question, else use value in asr column
                            if thisSheet[userTextColumn+str(cell.row)].value and not (thisSheet[userTextColumn+str(cell.row)].value).lower() == "check audio":
                                gapCoverageSheet["A"+str(gapCoverageCounter)] = thisSheet[userTextColumn+str(cell.row)].value
                            else:
                                gapCoverageSheet["A"+str(gapCoverageCounter)] = cell.value
                            gapCoverageSheet["B"+str(gapCoverageCounter)] = noGoodResponseID
                            gapCoverageSheet["D"+str(gapCoverageCounter)] = "annotation"
                            gapCoverageCounter += 1

                        else:
                            onTopicQuestions += 1


                    # If no alt responses are given for this 3, check the response ID column instead
                    else:
                        # If response ID is in the type Dict
                        if thisResponseID in typeDict:
                            # If this response is off topic, increment counter and copy data to gap coverage sheet
                            if typeDict[thisResponseID] in offTopicTypes:
                                offTopicQuestions += 1
                                # If User Text column has transcription, use this as question, else use value in asr column
                                if thisSheet[userTextColumn+str(cell.row)].value and not (thisSheet[userTextColumn+str(cell.row)].value).lower() == "check audio":
                                    gapCoverageSheet["A"+str(gapCoverageCounter)] = thisSheet[userTextColumn+str(cell.row)].value
                                else:
                                    gapCoverageSheet["A"+str(gapCoverageCounter)] = cell.value
                                gapCoverageSheet["B"+str(gapCoverageCounter)] = thisResponseID
                                gapCoverageSheet["C"+str(gapCoverageCounter)] = typeDict[thisResponseID]
                                gapCoverageSheet["D"+str(gapCoverageCounter)] = "response"
                                gapCoverageCounter += 1


                            # If this response is on topic, increment counter
                            else:
                                onTopicQuestions += 1

                        # If response ID not in type Dict, tally number of ID's not in Dict
                        else:
                            IDNotInDict += 1
                            gapCoverageSheet["J"+str(oldIDCounter)] = thisResponseID
                            oldIDCounter += 1


                # If the response is rated as 2 or 1
                elif thisOnTopic == 2 or thisOnTopic == 1:
                    # Extract survivor number from response ID
                    pinchasRE = re.compile(r'0')
                    splitResponse = thisResponseID.split("-")

                    if pinchasRE.match(splitResponse[0]):
                        survivorNumber = "0"
                        #print("EES")
                    else:
                        survivorNumber = splitResponse[0]
                    # Check if alternative responses are provided in survivor column
                    if thisSheet[(letterDict[survivorNumber])+str(cell.row)].value:
                        # Split cell value into list of individual IDs
                        cellSplit = (thisSheet[(letterDict[survivorNumber])+str(cell.row)].value).split("\n")
                        isOffTopic = "false"
                        noGoodResponse = "false"
                        OTResponseID = "ZZ"
                        #print("YES")
                        # Check each ID to see if it is Off Topic, if at least 1 ID is Off Topic set isOffTopic to true
                        for ID in cellSplit:
                            #print(ID)
                            if ID in typeDict:
                                if typeDict[ID] in offTopicTypes:
                                    isOffTopic = "true"
                                    OTResponseID = ID

                            elif noRE.match(ID.lower()):
                                    print("found one")
                                    noGoodResponse = "true"
                                    noGoodResponseID = ID

                            else:
                                IDNotInDict += 1
                                gapCoverageSheet["J"+str(oldIDCounter)] = thisResponseID
                                oldIDCounter += 1


                        # If isOffTopic is true, add question to off topic count, else add to on topic count
                        if isOffTopic == "true":
                            offTopicQuestions += 1
                            # If User Text column has transcription, use this as question, else use value in asr column
                            if thisSheet[userTextColumn+str(cell.row)].value and not (thisSheet[userTextColumn+str(cell.row)].value).lower() == "check audio":
                                gapCoverageSheet["A"+str(gapCoverageCounter)] = thisSheet[userTextColumn+str(cell.row)].value
                            else:
                                gapCoverageSheet["A"+str(gapCoverageCounter)] = cell.value
                            gapCoverageSheet["B"+str(gapCoverageCounter)] = OTResponseID
                            gapCoverageSheet["C"+str(gapCoverageCounter)] = typeDict[OTResponseID]
                            gapCoverageSheet["D"+str(gapCoverageCounter)] = "annotation"
                            gapCoverageCounter += 1

                        elif noGoodResponse == "true":
                            print("yes")
                            offTopicQuestions += 1
                            # If User Text column has transcription, use this as question, else use value in asr column
                            if thisSheet[userTextColumn+str(cell.row)].value and not (thisSheet[userTextColumn+str(cell.row)].value).lower() == "check audio":
                                gapCoverageSheet["A"+str(gapCoverageCounter)] = thisSheet[userTextColumn+str(cell.row)].value
                            else:
                                gapCoverageSheet["A"+str(gapCoverageCounter)] = cell.value
                            gapCoverageSheet["B"+str(gapCoverageCounter)] = noGoodResponseID
                            gapCoverageSheet["D"+str(gapCoverageCounter)] = "annotation"
                            gapCoverageCounter += 1

                        else:
                            onTopicQuestions += 1


                # If rating is not 1-4, add as invalid rating
                else:
                    if not thisSheet[offTopicColumn+str(cell.row)].value:
                        gapCoverageSheet["K"+str(invalidCounter)] = thisOnTopic
                        gapCoverageSheet["L"+str(invalidCounter)] = onTopicColumn+str(cell.row)
                        invalidCounter += 1


                # If off topic rating is 5, check annotations to see if OT is annotated
                if thisOffTopic == 5:
                    # Extract survivor number from response ID
                    pinchasRE = re.compile(r'0')
                    splitResponse = thisResponseID.split("-")

                    if pinchasRE.match(splitResponse[0]):
                        survivorNumber = "0"
                        #print("EES")
                    else:
                        survivorNumber = splitResponse[0]
                    # Check if alternative responses are provided in survivor column
                    if thisSheet[(letterDict[survivorNumber])+str(cell.row)].value:
                        # Split cell value into list of individual IDs
                        cellSplit = (thisSheet[(letterDict[survivorNumber])+str(cell.row)].value).split("\n")
                        isOffTopic = "false"
                        OTResponseID = "ZZ"
                        #print("YES")
                        # Check each ID to see if it is Off Topic, if at least 1 ID is Off Topic set isOffTopic to true
                        for ID in cellSplit:
                            if ID in typeDict:
                                if typeDict[ID] in offTopicTypes:
                                    isOffTopic = "true"
                                    OTResponseID = ID
                            else:
                                IDNotInDict += 1
                                gapCoverageSheet["J"+str(oldIDCounter)] = thisResponseID
                                oldIDCounter += 1


                        # If isOffTopic is true, add question to off topic count, else add to on topic count
                        if isOffTopic == "true":
                            offTopicQuestions += 1
                            # If User Text column has transcription, use this as question, else use value in asr column
                            if thisSheet[userTextColumn+str(cell.row)].value and not (thisSheet[userTextColumn+str(cell.row)].value).lower() == "check audio":
                                gapCoverageSheet["A"+str(gapCoverageCounter)] = thisSheet[userTextColumn+str(cell.row)].value
                            else:
                                gapCoverageSheet["A"+str(gapCoverageCounter)] = cell.value
                            gapCoverageSheet["B"+str(gapCoverageCounter)] = OTResponseID
                            gapCoverageSheet["C"+str(gapCoverageCounter)] = typeDict[OTResponseID]
                            gapCoverageSheet["D"+str(gapCoverageCounter)] = "annotation"
                            gapCoverageCounter += 1
                        else:
                            onTopicQuestions += 1

                # If off topic rating is 6, add to off topic count and output
                elif thisOffTopic == 6:
                    offTopicQuestions += 1
                    # If User Text column has transcription, use this as question, else use value in asr column
                    if thisSheet[userTextColumn+str(cell.row)].value and not (thisSheet[userTextColumn+str(cell.row)].value).lower() == "check audio":
                        gapCoverageSheet["A"+str(gapCoverageCounter)] = thisSheet[userTextColumn+str(cell.row)].value
                    else:
                        gapCoverageSheet["A"+str(gapCoverageCounter)] = cell.value
                    gapCoverageSheet["B"+str(gapCoverageCounter)] = thisResponseID
                    gapCoverageSheet["C"+str(gapCoverageCounter)] = typeDict[thisResponseID]
                    gapCoverageSheet["D"+str(gapCoverageCounter)] = "response"
                    gapCoverageCounter += 1


                else:
                    if not thisSheet[onTopicColumn+str(cell.row)].value:
                        gapCoverageSheet["K"+str(invalidCounter)] = thisOnTopic
                        gapCoverageSheet["L"+str(invalidCounter)] = onTopicColumn+str(cell.row)
                        invalidCounter += 1


                if not thisSheet[onTopicColumn+str(cell.row)].value and not thisSheet[offTopicColumn+str(cell.row)].value:
                    unrated += 1


    gapCoverageSheet["E2"] = totalQuestions
    gapCoverageSheet["F2"] = offTopicQuestions
    gapCoverageSheet["G2"] = onTopicQuestions
    gapCoverageSheet["H2"] = IDNotInDict
    gapCoverageSheet["I2"] = unrated

    #gapCoverage.save("C:\\Users\\CENSORED\\Desktop\\svn_ndt_all\\data-analysis\\alfred-testing-transcripts\\by-survivor\excel-version\\test-gap-coverage.xlsx")
    gapCoverage.save("C:\\Users\\mello\\Desktop\\svn_ndt_all\\data-analysis\\alfred-testing-transcripts\\by-survivor\\excel-version\\test-gap-coverage.xlsx")


#assessCoverageGaps("helena-alfred-transcripts", "machine")
assessCoverageGaps("sfa-test", "survivor")