# Untitled

import csv
from pydub import AudioSegment
from os import walk
from os.path import join, basename
import numpy


def count_silent_chunks(chunks, threshold, rev=False):
    """
    Count the consecutive silent chunks at one edge of a sound file.

    The chunks are visited from the chosen edge and counting stops at the
    first chunk that is not silent; chunks beyond it are never inspected.

    Keyword arguments:
    chunks      --  a list of chunks of a sound file; every chunk must expose
                    a ``dBFS`` attribute
    threshold   --  dbFS value to compare against for detection of silence
    rev         --  which edge to start from (True = right edge)

    Returns the number of silent chunks found before the first noise, as an
    integer (0 for an empty list or when the edge chunk is already noisy).
    """
    # Iterate the chunks themselves instead of indexing through xrange(), so
    # the function does not depend on a Python-2-only builtin.
    ordered = reversed(chunks) if rev else chunks

    silent_blocks = 0
    for chunk in ordered:
        level = chunk.dBFS  # read once per chunk instead of once per test
        # The explicit -inf test keeps digital silence counted even when the
        # threshold itself is -inf (-inf < -inf is False).
        if level == float('-inf') or level < threshold:
            silent_blocks += 1
        else:
            break
    return silent_blocks


def get_silence(audio, interval, threshold, step):
    """Get the length of silence at both edges of a wav file.

    The file is cut into chunks of ``interval`` ms. Starting at ``threshold``
    and moving towards 0 dbFS in increments of ``step``, the first threshold
    at which more than the minimum number of leading chunks count as silent
    is selected; the same threshold is then applied to the trailing edge.

    Keyword arguments:
    audio       --  filename
    interval    --  size of chunks in ms (must be an integer, it is used as a
                    range() step)
    threshold   --  start dbFS threshold for silence comparison
    step        --  amount to increment by when searching for silence threshold

    Returns a dict
    {start: [ms, blocks], end: [ms, blocks], duration: ms, threshold: dbFS}
    where
    - a value of -1 for ms or blocks means no plausible silence was found,
    - ms is measured to the middle of the last silent chunk; the end value
      additionally includes the trimmed tail,
    - duration is the number of chunks times ``interval`` (after trimming),
    - threshold is the selected dbFS value, 0 if the search found none, or 1
      if the search was aborted by the plausibility guard.
    """

    # chop off this number of ms from end of file (mouse click)
    end_omit = 250
    wav = AudioSegment.from_wav(audio)[:-1 * end_omit]

    # break into chunks of interval ms
    chunks = [wav[i:i + interval]
              for i in range(0, len(wav), interval)]

    # min/max chunks of silence to guard against implausible results.
    # Floor division keeps these whole chunk counts regardless of whether
    # "/" means integer or true division in the running interpreter.
    min_silence = 250 // interval
    max_silence = len(chunks) - min_silence - 1

    # find number of chunks with dBFS below threshold at start
    silent_blocks_start = 0
    selected_threshold = 0  # stays 0 when no threshold qualifies
    for candidate in numpy.arange(threshold, 0, step):
        # NOTE(review): a count that exceeds min_silence leaves the loop via
        # the break below, so this guard can only fire when max_silence is
        # smaller than min_silence (very short files) - confirm whether it
        # was meant to run after the count instead.
        if silent_blocks_start > max_silence:
            silent_blocks_start = -1
            selected_threshold = 1
            break
        silent_blocks_start = count_silent_chunks(chunks, candidate)
        if silent_blocks_start > min_silence:
            selected_threshold = candidate
            break

    # find number of chunks with dBFS below threshold at end; only a negative
    # threshold is a real search result (0 and 1 are the sentinels from above)
    if selected_threshold < 0:
        silent_blocks_end = count_silent_chunks(
            chunks, selected_threshold, True)
    else:
        silent_blocks_end = -1

    if silent_blocks_end < min_silence + 1:
        silent_blocks_end = -1

    # convert chunk counts to ms, measured to the middle of the last silent
    # chunk; implausible counts keep the -1 marker
    half_interval = interval // 2
    end_ms_silence = -1
    start_ms_silence = -1
    if silent_blocks_start > 0 and silent_blocks_start < max_silence:
        start_ms_silence = silent_blocks_start * interval - half_interval
    if silent_blocks_end > 0 and silent_blocks_end < max_silence:
        # add back the tail that was trimmed before chunking
        end_ms_silence = (silent_blocks_end * interval - half_interval
                          + end_omit)

    return {
        "start": [start_ms_silence, silent_blocks_start],
        "end": [end_ms_silence, silent_blocks_end],
        "duration": len(chunks) * interval,
        "threshold": selected_threshold,
    }


def print_item(name, vals):
    """Print the values for one file as a single aligned row.

    Keyword arguments:
    name    --  path of the sound file; only its base name is printed
    vals    --  result dict with the keys "start" and "end" (each a
                [ms, blocks] pair), "duration" and "threshold"
    """
    row = '{:>16} \t {:>8} {:>4} \t {:>8} {:>4} \t {:>8} \t {:-03.3f}'.format(
        basename(name),
        vals["start"][0], vals["start"][1],
        vals["end"][0], vals["end"][1],
        vals["duration"], vals["threshold"])
    # A single parenthesised argument prints the same text whether print is
    # a statement (Python 2) or a function (Python 3).
    print(row)


def write_csv(data, filename):
    """Write the per-file results to a csv file.

    Keyword arguments:
    data        --  iterable of (name, vals) pairs; vals is a dict with the
                    keys "start" and "end" (each a [ms, blocks] pair),
                    "duration" and "threshold"
    filename    --  path of the csv file to create (overwritten if present)

    One header row is written, followed by one row per pair; only the base
    name of each file path is stored.
    """
    import sys  # local import: only needed to choose the file mode

    path = str(filename)
    # The csv module wants a binary file on Python 2 but a text file opened
    # with newline='' on Python 3; either way rows end in "\r\n" untouched.
    if sys.version_info[0] < 3:
        csvfile = open(path, 'wb')
    else:
        csvfile = open(path, 'w', newline='')

    with csvfile:
        writer = csv.writer(csvfile, delimiter=',')
        writer.writerow(["file", "leading_silence_ms", "leading_silence_blocks",
                         "trailing_silence_ms", "trailing_silence_blocks", "duration", "dbFS_threshold"])
        for (name, vals) in data:
            writer.writerow([
                basename(name),
                vals["start"][0], vals["start"][1],
                vals["end"][0], vals["end"][1],
                vals["duration"], vals["threshold"]])


# collect the wanted sound files from every directory below audio_path
audio_path = "/Users/tyler/Downloads/recordings"

# sub-directories that are never searched
_skipped_dirs = (
    "prac",  # practice items
    "108",   # this is a glitch
    "0",     # this is test data
    "1000",  # this is test data
)

audio_files = []
for root, dirs, files in walk(audio_path):
    # only sound files of Experimental items (E for EXP)
    audio_files.extend(
        join(root, name) for name in files if "wav" in name and "E" in name)
    # removing an entry from dirs in place keeps walk from descending into it
    for unwanted in _skipped_dirs:
        if unwanted in dirs:
            dirs.remove(unwanted)

threshold = -80   # starting dbFS value for detecting silence
step = 0.01  # dbFS increment during search
interval = 50  # ms, increase to speed up

# iterate over all files and find silences.
# The entries of audio_files are already complete paths (root joined with the
# file name), so they are handed over unchanged; joining them onto the root
# directory again would duplicate the prefix whenever that root is relative.
edge_silences = {a: get_silence(a, interval, threshold, step)
                 for a in audio_files}

# sort once by file path so the printed table and the csv share one order
sorted_results = sorted(edge_silences.items(), key=lambda item: item[0])

# output result and count files with bad values
fails = 0
for name, vals in sorted_results:
    if vals["start"][0] < 0 or vals["end"][0] < 0:
        # no value found for start and/or end of file
        fails += 1
    print_item(name, vals)
# single parenthesised argument: same output with the print statement
# (Python 2) and the print function (Python 3)
print('\n' + ' '.join([str(fails), "failures out of", str(len(edge_silences))]))

# write results to file
write_csv(sorted_results, "output.csv")