Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import time
from datetime import datetime
from glob import glob

import datacube
import pandas as pd
from datacube.model import Range

dc = datacube.Datacube(app='dc-example')
def check_orphan_id(files_on_disk, item, year):
    """Cross-check indexed datasets against metadata files found on disk.

    Two passes:
      1. For every active dataset indexed for *item*/*year*, verify its
         recorded location exists and that the file's first line contains
         the dataset id.
      2. For every metadata file in *files_on_disk*, verify its id is in
         the index (distinguishing archived datasets from truly missing).

    Parameters
    ----------
    files_on_disk : iterable of str
        Paths to ga-metadata.yaml files found on disk.
    item : str
        Product name to search the index for.
    year : int
        Calendar year; datasets are searched in [year, year+1).

    Returns
    -------
    dict
        Keyed by dataset id (or on-disk id); each value records product,
        year, id, local_path and an 'issue' tag describing the mismatch.
    """
    response = {}
    # search_eager returns only active (non-archived) datasets — presumably
    # why dc.index.datasets.has() is used below to detect archived ones.
    datasets = dc.index.datasets.search_eager(
        product=item,
        time=Range(datetime(year, 1, 1), datetime(year + 1, 1, 1)))

    # Pass 1: indexed datasets -> does the file exist and carry the right id?
    for ds in datasets:
        if ds.local_path is None:
            # No location recorded in the index at all.
            response[ds.id] = {'product': item, 'year': year, 'id': ds.id,
                               'local_path': 'None', 'issue': "location_none"}
        elif not ds.local_path.exists():
            # Location recorded, but the file is gone from disk.
            # BUGFIX: was `i.local.absolute` — no such attribute; use local_path.
            response[ds.id] = {'product': item, 'year': year, 'id': ds.id,
                               'local_path': ds.local_path.absolute(),
                               'issue': "location_not_on_disk"}
        else:
            # First line of the metadata file is expected to contain the id.
            with open(str(ds.local_path), 'r') as target:
                id_on_disk = target.readline().strip()
            # BUGFIX: ds.id is a UUID; `in` against a str needs str(ds.id).
            if str(ds.id) not in id_on_disk:
                # Re-read to report the id actually stored at that location
                # (assumes "key: value" layout on the first line).
                with open(str(ds.local_path), 'r') as target:
                    id_on_disk = target.readline().split(' ')[1].strip()
                response[ds.id] = {'product': item, 'year': year,
                                   'id': id_on_disk,
                                   'local_path': ds.local_path.absolute(),
                                   'issue': "id_differs_at_location"}

    # Pass 2: files on disk -> is each one's id in the index?
    # BUGFIX: the original dict was keyed by UUID objects but probed with
    # strings, so membership never matched; compare strings to strings.
    indexed_ids = {str(ds.id) for ds in datasets}
    for path in files_on_disk:
        with open(path, 'r') as target:
            id_on_disk = target.readline().split(' ')[1].strip()
        if id_on_disk not in indexed_ids:
            # has() also sees archived datasets, so we can tell
            # "archived" apart from "not indexed at all".
            if dc.index.datasets.has(id_on_disk):
                issue = "disk_id_not_in_db_archived"
            else:
                issue = "disk_id_not_in_db"
            response[id_on_disk] = {'product': item, 'year': year,
                                    'id': id_on_disk, 'local_path': path,
                                    'issue': issue}
    return response
# Driver: scan every (year, sensor, product) combination, collect
# orphan/mismatch reports, and assemble them into one DataFrame.
indexing_stats = {}
years = list(range(1986, 1988))
products = ['_pq_scene', '_nbar_scene', '_nbart_scene', '_level1_scene']
sensors = ['ls5']  # ,'ls7','ls8']

# Metadata-file glob pattern for each product suffix.
SCENE_GLOBS = {
    '_pq_scene': '/g/data/rs0/scenes/pq-scenes-tmp/{sensor}/{year}/*/output/pqa/*/ga-metadata.yaml',
    '_nbar_scene': '/g/data/rs0/scenes/nbar-scenes-tmp/{sensor}/{year}/*/output/nbar/*/ga-metadata.yaml',
    '_nbart_scene': '/g/data/rs0/scenes/nbar-scenes-tmp/{sensor}/{year}/*/output/nbart/*/ga-metadata.yaml',
    '_level1_scene': '/g/data/v10/reprocess/{sensor}/level1/{year}/*/*/ga-metadata.yaml',
}

start_time = time.time()
for year in years:
    for sensor in sensors:
        for suffix in products:
            # BUGFIX: replaced the IPython-only `!ls ...` magic with
            # sorted(glob(...)) — plain Python, same file set, `ls`-like order.
            pattern = SCENE_GLOBS[suffix].format(sensor=sensor, year=year)
            disk_contents = sorted(glob(pattern))
            item = sensor + suffix
            # BUGFIX: key by (item, year) — keying on item alone let each
            # later year silently overwrite the previous year's results.
            indexing_stats[(item, year)] = check_orphan_id(disk_contents, item, year)
            print(year, item)
print("--- %s seconds ---" % (time.time() - start_time))

# Flatten the nested results into a (item, year, dataset_id)-indexed frame.
coherence = pd.DataFrame.from_dict(
    {(item, year, ds_id): record
     for (item, year), issues in indexing_stats.items()
     for ds_id, record in issues.items()},
    orient='index')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement