Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- """
- A script to determine how many questions in a given annotation file received "Off-Topic" responses.
- Things this script does:
- - Examines all responses in file and calculates total of responses that are "Off-Topic".
- Output:
- - An excel file containing all questions that triggered Off-Topic responses
- - Totals: Number of questions, number of questions with on-topic answer, number of questions with off-topic answer
- Params:
- """
- import re
- import os
- import openpyxl
- from openpyxl import Workbook
- def assessCoverageGaps(fileName, whichPath):
- # For each annotated transcript, add all unique response ID's to dict as key, with type annotation as value
- # For each link annotation file, compare responses to dict and compile totals of how many were "off topics"
- # Need to create a list of all types deemed "Off-Topic"
- # NEED TO CONSIDER ONLY QUESTIONS THAT HAVE BEEN LINK ANNOTATED OR RATED AS 4, CHECK LINK ANNOTATION TYPES TOO!!!! <------------
- # Read all file names from annotation folder and store only annotated transcripts to list
- annotationRE = re.compile(r'Annotated')
- #fullDirectoryList = os.listdir("C:\\Users\\CENSORED\\DeskEtop\\svn_ndt\\testing\\Annotations\\type-annotations")
- fullDirectoryList = os.listdir("C:\\Users\\mello\\Desktop\\testing\\Annotations\\type-annotations")
- annotatedTranscriptList = []
- for name in fullDirectoryList:
- if annotationRE.search(name) and not name == "Annotated - Eva-Kor-Full-Transcript-Jill.xlsx":
- annotatedTranscriptList.append(name)
- print(annotatedTranscriptList)
- # A Dict to hold all response IDs and their corresponding types
- typeDict = {}
- # Read each file in one by one, find "ID" column, then iter through all cells in that column, adding unique IDs to Dict with type as value
- for transcript in annotatedTranscriptList:
- #annotatedTranscript = openpyxl.load_workbook("C:\\Users\\CENSORED\\Desktop\\svn_ndt\\testing\\Annotations\\type-annotations\\"+transcript)
- annotatedTranscript = openpyxl.load_workbook("C:\\Users\\mello\\Desktop\\testing\\Annotations\\type-annotations\\"+transcript)
- annotatedSheet = annotatedTranscript.active
- # Find column letter for ID
- IDRE = re.compile(r'id')
- IDColumn = "ZZ"
- for col in annotatedSheet.iter_cols(min_row = 1, max_col = annotatedSheet.max_column, max_row = 1):
- for cell in col:
- if cell.value:
- if IDRE.search((cell.value).lower()):
- IDColumn = cell.column
- # Find column letter for type
- typeRE = re.compile(r'type')
- typeColumn = ""
- for col in annotatedSheet.iter_cols(min_row = 1, max_col = annotatedSheet.max_column, max_row = 1):
- for cell in col:
- if cell.value:
- if typeRE.search((cell.value).lower()):
- typeColumn = cell.column
- # Compile typeDict
- for cellRows in annotatedSheet[IDColumn+"2":IDColumn+str(annotatedSheet.max_row)]:
- for cell in cellRows:
- if cell.value:
- if cell.value not in typeDict and annotatedSheet[typeColumn+str(cell.row)].value:
- #print("yes")
- typeDict[cell.value] = annotatedSheet[typeColumn+str(cell.row)].value
- # The file path for the annotation files
- # Work Machine
- """
- if whichPath == "machine":
- newFilePath = "C:\\Users\\CENSORED\\Desktop\\svn_ndt_all\\data-analysis\\alfred-testing-transcripts\\by-machine\\excel-version\\"
- elif whichPath == "survivor":
- newFilePath = "C:\\Users\\CENSORED\\Desktop\\svn_ndt_all\\data-analysis\\alfred-testing-transcripts\\by-survivor\\excel-version\\"
- elif whichPath == "pinchas":
- newFilePath = "C:\\Users\\CENSORED\\Desktop\\svn_ndt_all\\data-analysis\\additional-questions-for-linking"
- else:
- print("Invalid whichPath")
- """
- # Home Machine
- if whichPath == "machine":
- newFilePath = "C:\\Users\\mello\\Desktop\\svn_ndt_all\\data-analysis\\alfred-testing-transcripts\\by-machine\\excel-version\\"
- elif whichPath == "survivor":
- newFilePath = "C:\\Users\\mello\\Desktop\\svn_ndt_all\\data-analysis\\alfred-testing-transcripts\\by-survivor\\excel-version\\"
- elif whichPath == "pinchas":
- newFilePath = "C:\\Users\\mello\\Desktop\\svn_ndt_all\\data-analysis\\additional-questions-for-linking"
- else:
- print("Invalid whichPath")
- # Open link annotation file
- thisFile = openpyxl.load_workbook(newFilePath+fileName+".xlsx")
- thisSheet = thisFile.active
- # Assign column letters for each survivor
- pinchasLetter = "ZZ"
- samLetter = "ZZ"
- fritzieLetter = "ZZ"
- aaronLetter = "ZZ"
- izzyLetter = "ZZ"
- adinaLetter = "ZZ"
- janineLetter = "ZZ"
- matusLetter = "ZZ"
- evaKLetter = "ZZ"
- evaSLetter = "ZZ"
- anitaLetter = "ZZ"
- reneeLetter = "ZZ"
- for col in thisSheet.iter_cols(min_row = 1, max_col = thisSheet.max_column, max_row = 1):
- for cell in col:
- if cell.value:
- cellValueLow = (cell.value).lower()
- cellValueStripped = cellValueLow.strip()
- if cellValueStripped == "pinchas":
- pinchasLetter = cell.column
- elif cellValueStripped == "sam":
- samLetter = cell.column
- elif cellValueStripped == "fritzie":
- fritzieLetter = cell.column
- elif cellValueStripped == "aaron":
- aaronLetter = cell.column
- elif cellValueStripped == "izzy":
- izzyLetter = cell.column
- elif cellValueStripped == "adina":
- adinaLetter = cell.column
- elif cellValueStripped == "janine":
- janineLetter = cell.column
- elif cellValueStripped == "matus":
- matusLetter = cell.column
- elif cellValueStripped == "eva k":
- evaKLetter = cell.column
- elif cellValueStripped == "eva s":
- evaSLetter = cell.column
- elif cellValueStripped == "anita":
- anitaLetter = cell.column
- elif cellValueStripped == "renee":
- reneeLetter = cell.column
- print("Pinchas: "+pinchasLetter)
- # Find column letter for Response ID, asr, user text, On-Topic and Off-Topic
- responseIDRE = re.compile(r'response id')
- responseIDColumn = "ZZ"
- asrColumn = "ZZ"
- asrRE = re.compile(r'asr')
- userTextColumn = "ZZ"
- userTextRE = re.compile(r'user')
- onTopicRE = re.compile(r'on-topic')
- onTopicColumn = "ZZ"
- offTopicRE = re.compile(r'off-topic')
- offTopicColumn = "ZZ"
- for col in thisSheet.iter_cols(min_row = 1, max_col = thisSheet.max_column, max_row = 1):
- for cell in col:
- if cell.value:
- if responseIDRE.search((cell.value).lower()):
- responseIDColumn = cell.column
- elif asrRE.search((cell.value).lower()):
- asrColumn = cell.column
- elif userTextRE.search((cell.value).lower()):
- userTextColumn = cell.column
- elif onTopicRE.search((cell.value).lower()):
- onTopicColumn = cell.column
- elif offTopicRE.search((cell.value).lower()):
- offTopicColumn = cell.column
- print("ResponseID: "+responseIDColumn)
- print("ASR: "+asrColumn)
- print("UserText: "+userTextColumn)
- print("On-Topic: "+onTopicColumn)
- print("Off-Topic: "+offTopicColumn)
- # List of all "off-topic" types
- offTopicTypes = ["cant_answer", "cant_answer_reason", "cant_answer_time", "cant_answer_war", "change_topic", "direct_to_resource", "dont_understand", "other_experience",
- "rephrase", "say_again", "segue_to_related_topic", "seque_to_specific_story", "segue_to_story", "subject_idea", "subject_yes_no", "topic_intro",
- "topic_request_for_story"]
- # List of "won't answer" types - ANALYZE SEPERATELY FROM OFF-TOPICS!!!
- wontAnswerTypes = ["wont_answer", "wont_answer_inappropriate", "wont_answer_now", "wont_answer_too_personal"]
- # Calculate data totals and copy over questions, responses and types for off-topic responses
- totalQuestions = 0
- offTopicQuestions = 0
- onTopicQuestions = 0
- unrated = 0
- wontAnswerQuestions = 0
- IDNotInDict = 0
- gapCoverage = Workbook()
- gapCoverageSheet = gapCoverage.active
- gapCoverageSheet["A1"] = "Question"
- gapCoverageSheet["B1"] = "Response"
- gapCoverageSheet["C1"] = "Type"
- gapCoverageSheet["D1"] = "Where?"
- gapCoverageSheet["E1"] = "Total Questions"
- gapCoverageSheet["F1"] = "Off Topic"
- gapCoverageSheet["G1"] = "On Topic"
- gapCoverageSheet["H1"] = "ID Changed"
- gapCoverageSheet["I1"] = "Unrated"
- gapCoverageSheet["J1"] = "Old ID"
- gapCoverageSheet["K1"] = "Invalid Rating"
- gapCoverageSheet["L1"] = "Location"
- gapCoverageCounter = 2
- oldIDCounter = 2
- invalidCounter = 2
- # A dict with survivor number as key and survivor letter as value
- letterDict = {}
- letterDict["0"] = pinchasLetter
- letterDict["2"] = anitaLetter
- letterDict["3"] = samLetter
- letterDict["4"] = fritzieLetter
- letterDict["5"] = aaronLetter
- letterDict["6"] = reneeLetter
- letterDict["7"] = evaSLetter
- letterDict["8"] = adinaLetter
- letterDict["9"] = janineLetter
- letterDict["10"] = izzyLetter
- letterDict["11"] = matusLetter
- letterDict["12"] = evaKLetter
- # an RE to check for annotation NO_GOOD_RESPONSE
- noRE = re.compile(r'no')
- # 1) First check if the question/response pair was rated as a 4 or 3, if it was a 4, only check type of ID in response ID column [DONE]
- # if it was a 3, check if there are alternatives given, if there are check the type of the alternatives, if not, check type of ID in response ID column [DONE]
- # if it was a 1 or 2 check if there are alternatives given, if there are check the type of the alternatives [DONE]
- # also check for "NO_GOOD_RESPONSE" annotations, in these cases, count these as "offTopicQuestions" <--
- # also check for ratings of 5 and 6!!! [DONE]
- # add column to output sheet to indicate if off topic response is from annotation column or response ID column [DONE]
- for cellRows in thisSheet[asrColumn+"2":asrColumn+str(thisSheet.max_row)]:
- for cell in cellRows:
- if cell.value and thisSheet[responseIDColumn+str(cell.row)].value:
- totalQuestions += 1
- # store response ID and all associated annotations in variables
- thisResponseID = thisSheet[responseIDColumn+str(cell.row)].value
- thisOnTopic = thisSheet[onTopicColumn+str(cell.row)].value
- thisOffTopic = thisSheet[offTopicColumn+str(cell.row)].value
- # If the response is rated as 4
- if thisOnTopic == 4:
- # If response ID is in the type Dict
- if thisResponseID in typeDict:
- # If this response is off topic, increment counter and copy data to gap coverage sheet
- if typeDict[thisResponseID] in offTopicTypes:
- offTopicQuestions += 1
- # If User Text column has transcription, use this as question, else use value in asr column
- if thisSheet[userTextColumn+str(cell.row)].value and not (thisSheet[userTextColumn+str(cell.row)].value).lower() == "check audio":
- gapCoverageSheet["A"+str(gapCoverageCounter)] = thisSheet[userTextColumn+str(cell.row)].value
- else:
- gapCoverageSheet["A"+str(gapCoverageCounter)] = cell.value
- gapCoverageSheet["B"+str(gapCoverageCounter)] = thisResponseID
- gapCoverageSheet["C"+str(gapCoverageCounter)] = typeDict[thisResponseID]
- gapCoverageSheet["D"+str(gapCoverageCounter)] = "response"
- gapCoverageCounter += 1
- # If this response is on topic, increment counter
- else:
- onTopicQuestions += 1
- # If response ID not in type Dict, tally number of ID's not in Dict
- else:
- IDNotInDict += 1
- gapCoverageSheet["J"+str(oldIDCounter)] = thisResponseID
- oldIDCounter += 1
- # If the response is rated as 3
- elif thisOnTopic == 3:
- # Extract survivor number from response ID
- pinchasRE = re.compile(r'0')
- splitResponse = thisResponseID.split("-")
- if pinchasRE.match(splitResponse[0]):
- survivorNumber = "0"
- #print("EES")
- else:
- survivorNumber = splitResponse[0]
- # Check if alternative responses are provided in survivor column
- if thisSheet[(letterDict[survivorNumber])+str(cell.row)].value:
- # Split cell value into list of individual IDs
- cellSplit = (thisSheet[(letterDict[survivorNumber])+str(cell.row)].value).split("\n")
- isOffTopic = "false"
- noGoodResponse = "false"
- OTResponseID = "ZZ"
- #print("YES")
- # Check each ID to see if it is Off Topic, if at least 1 ID is Off Topic set isOffTopic to true
- for ID in cellSplit:
- if ID in typeDict:
- if typeDict[ID] in offTopicTypes:
- isOffTopic = "true"
- OTResponseID = ID
- elif noRE.match(ID.lower()):
- noGoodResponse = "true"
- noGoodResponseID = ID
- else:
- IDNotInDict += 1
- gapCoverageSheet["J"+str(oldIDCounter)] = thisResponseID
- oldIDCounter += 1
- # If isOffTopic is true, add question to off topic count, else add to on topic count
- if isOffTopic == "true":
- offTopicQuestions += 1
- # If User Text column has transcription, use this as question, else use value in asr column
- if thisSheet[userTextColumn+str(cell.row)].value and not (thisSheet[userTextColumn+str(cell.row)].value).lower() == "check audio":
- gapCoverageSheet["A"+str(gapCoverageCounter)] = thisSheet[userTextColumn+str(cell.row)].value
- else:
- gapCoverageSheet["A"+str(gapCoverageCounter)] = cell.value
- gapCoverageSheet["B"+str(gapCoverageCounter)] = OTResponseID
- gapCoverageSheet["C"+str(gapCoverageCounter)] = typeDict[OTResponseID]
- gapCoverageSheet["D"+str(gapCoverageCounter)] = "annotation"
- gapCoverageCounter += 1
- elif noGoodResponse == "true":
- offTopicQuestions += 1
- # If User Text column has transcription, use this as question, else use value in asr column
- if thisSheet[userTextColumn+str(cell.row)].value and not (thisSheet[userTextColumn+str(cell.row)].value).lower() == "check audio":
- gapCoverageSheet["A"+str(gapCoverageCounter)] = thisSheet[userTextColumn+str(cell.row)].value
- else:
- gapCoverageSheet["A"+str(gapCoverageCounter)] = cell.value
- gapCoverageSheet["B"+str(gapCoverageCounter)] = noGoodResponseID
- gapCoverageSheet["D"+str(gapCoverageCounter)] = "annotation"
- gapCoverageCounter += 1
- else:
- onTopicQuestions += 1
- # If no alt responses are given for this 3, check the response ID column instead
- else:
- # If response ID is in the type Dict
- if thisResponseID in typeDict:
- # If this response is off topic, increment counter and copy data to gap coverage sheet
- if typeDict[thisResponseID] in offTopicTypes:
- offTopicQuestions += 1
- # If User Text column has transcription, use this as question, else use value in asr column
- if thisSheet[userTextColumn+str(cell.row)].value and not (thisSheet[userTextColumn+str(cell.row)].value).lower() == "check audio":
- gapCoverageSheet["A"+str(gapCoverageCounter)] = thisSheet[userTextColumn+str(cell.row)].value
- else:
- gapCoverageSheet["A"+str(gapCoverageCounter)] = cell.value
- gapCoverageSheet["B"+str(gapCoverageCounter)] = thisResponseID
- gapCoverageSheet["C"+str(gapCoverageCounter)] = typeDict[thisResponseID]
- gapCoverageSheet["D"+str(gapCoverageCounter)] = "response"
- gapCoverageCounter += 1
- # If this response is on topic, increment counter
- else:
- onTopicQuestions += 1
- # If response ID not in type Dict, tally number of ID's not in Dict
- else:
- IDNotInDict += 1
- gapCoverageSheet["J"+str(oldIDCounter)] = thisResponseID
- oldIDCounter += 1
- # If the response is rated as 2 or 1
- elif thisOnTopic == 2 or thisOnTopic == 1:
- # Extract survivor number from response ID
- pinchasRE = re.compile(r'0')
- splitResponse = thisResponseID.split("-")
- if pinchasRE.match(splitResponse[0]):
- survivorNumber = "0"
- #print("EES")
- else:
- survivorNumber = splitResponse[0]
- # Check if alternative responses are provided in survivor column
- if thisSheet[(letterDict[survivorNumber])+str(cell.row)].value:
- # Split cell value into list of individual IDs
- cellSplit = (thisSheet[(letterDict[survivorNumber])+str(cell.row)].value).split("\n")
- isOffTopic = "false"
- noGoodResponse = "false"
- OTResponseID = "ZZ"
- #print("YES")
- # Check each ID to see if it is Off Topic, if at least 1 ID is Off Topic set isOffTopic to true
- for ID in cellSplit:
- #print(ID)
- if ID in typeDict:
- if typeDict[ID] in offTopicTypes:
- isOffTopic = "true"
- OTResponseID = ID
- elif noRE.match(ID.lower()):
- print("found one")
- noGoodResponse = "true"
- noGoodResponseID = ID
- else:
- IDNotInDict += 1
- gapCoverageSheet["J"+str(oldIDCounter)] = thisResponseID
- oldIDCounter += 1
- # If isOffTopic is true, add question to off topic count, else add to on topic count
- if isOffTopic == "true":
- offTopicQuestions += 1
- # If User Text column has transcription, use this as question, else use value in asr column
- if thisSheet[userTextColumn+str(cell.row)].value and not (thisSheet[userTextColumn+str(cell.row)].value).lower() == "check audio":
- gapCoverageSheet["A"+str(gapCoverageCounter)] = thisSheet[userTextColumn+str(cell.row)].value
- else:
- gapCoverageSheet["A"+str(gapCoverageCounter)] = cell.value
- gapCoverageSheet["B"+str(gapCoverageCounter)] = OTResponseID
- gapCoverageSheet["C"+str(gapCoverageCounter)] = typeDict[OTResponseID]
- gapCoverageSheet["D"+str(gapCoverageCounter)] = "annotation"
- gapCoverageCounter += 1
- elif noGoodResponse == "true":
- print("yes")
- offTopicQuestions += 1
- # If User Text column has transcription, use this as question, else use value in asr column
- if thisSheet[userTextColumn+str(cell.row)].value and not (thisSheet[userTextColumn+str(cell.row)].value).lower() == "check audio":
- gapCoverageSheet["A"+str(gapCoverageCounter)] = thisSheet[userTextColumn+str(cell.row)].value
- else:
- gapCoverageSheet["A"+str(gapCoverageCounter)] = cell.value
- gapCoverageSheet["B"+str(gapCoverageCounter)] = noGoodResponseID
- gapCoverageSheet["D"+str(gapCoverageCounter)] = "annotation"
- gapCoverageCounter += 1
- else:
- onTopicQuestions += 1
- # If rating is not 1-4, add as invalid rating
- else:
- if not thisSheet[offTopicColumn+str(cell.row)].value:
- gapCoverageSheet["K"+str(invalidCounter)] = thisOnTopic
- gapCoverageSheet["L"+str(invalidCounter)] = onTopicColumn+str(cell.row)
- invalidCounter += 1
- # If off topic rating is 5, check annotations to see if OT is annotated
- if thisOffTopic == 5:
- # Extract survivor number from response ID
- pinchasRE = re.compile(r'0')
- splitResponse = thisResponseID.split("-")
- if pinchasRE.match(splitResponse[0]):
- survivorNumber = "0"
- #print("EES")
- else:
- survivorNumber = splitResponse[0]
- # Check if alternative responses are provided in survivor column
- if thisSheet[(letterDict[survivorNumber])+str(cell.row)].value:
- # Split cell value into list of individual IDs
- cellSplit = (thisSheet[(letterDict[survivorNumber])+str(cell.row)].value).split("\n")
- isOffTopic = "false"
- OTResponseID = "ZZ"
- #print("YES")
- # Check each ID to see if it is Off Topic, if at least 1 ID is Off Topic set isOffTopic to true
- for ID in cellSplit:
- if ID in typeDict:
- if typeDict[ID] in offTopicTypes:
- isOffTopic = "true"
- OTResponseID = ID
- else:
- IDNotInDict += 1
- gapCoverageSheet["J"+str(oldIDCounter)] = thisResponseID
- oldIDCounter += 1
- # If isOffTopic is true, add question to off topic count, else add to on topic count
- if isOffTopic == "true":
- offTopicQuestions += 1
- # If User Text column has transcription, use this as question, else use value in asr column
- if thisSheet[userTextColumn+str(cell.row)].value and not (thisSheet[userTextColumn+str(cell.row)].value).lower() == "check audio":
- gapCoverageSheet["A"+str(gapCoverageCounter)] = thisSheet[userTextColumn+str(cell.row)].value
- else:
- gapCoverageSheet["A"+str(gapCoverageCounter)] = cell.value
- gapCoverageSheet["B"+str(gapCoverageCounter)] = OTResponseID
- gapCoverageSheet["C"+str(gapCoverageCounter)] = typeDict[OTResponseID]
- gapCoverageSheet["D"+str(gapCoverageCounter)] = "annotation"
- gapCoverageCounter += 1
- else:
- onTopicQuestions += 1
- # If off topic rating is 6, add to off topic count and output
- elif thisOffTopic == 6:
- offTopicQuestions += 1
- # If User Text column has transcription, use this as question, else use value in asr column
- if thisSheet[userTextColumn+str(cell.row)].value and not (thisSheet[userTextColumn+str(cell.row)].value).lower() == "check audio":
- gapCoverageSheet["A"+str(gapCoverageCounter)] = thisSheet[userTextColumn+str(cell.row)].value
- else:
- gapCoverageSheet["A"+str(gapCoverageCounter)] = cell.value
- gapCoverageSheet["B"+str(gapCoverageCounter)] = thisResponseID
- gapCoverageSheet["C"+str(gapCoverageCounter)] = typeDict[thisResponseID]
- gapCoverageSheet["D"+str(gapCoverageCounter)] = "response"
- gapCoverageCounter += 1
- else:
- if not thisSheet[onTopicColumn+str(cell.row)].value:
- gapCoverageSheet["K"+str(invalidCounter)] = thisOnTopic
- gapCoverageSheet["L"+str(invalidCounter)] = onTopicColumn+str(cell.row)
- invalidCounter += 1
- if not thisSheet[onTopicColumn+str(cell.row)].value and not thisSheet[offTopicColumn+str(cell.row)].value:
- unrated += 1
- gapCoverageSheet["E2"] = totalQuestions
- gapCoverageSheet["F2"] = offTopicQuestions
- gapCoverageSheet["G2"] = onTopicQuestions
- gapCoverageSheet["H2"] = IDNotInDict
- gapCoverageSheet["I2"] = unrated
- #gapCoverage.save("C:\\Users\\CENSORED\\Desktop\\svn_ndt_all\\data-analysis\\alfred-testing-transcripts\\by-survivor\excel-version\\test-gap-coverage.xlsx")
- gapCoverage.save("C:\\Users\\mello\\Desktop\\svn_ndt_all\\data-analysis\\alfred-testing-transcripts\\by-survivor\\excel-version\\test-gap-coverage.xlsx")
- #assessCoverageGaps("helena-alfred-transcripts", "machine")
- assessCoverageGaps("sfa-test", "survivor")
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement