Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
import time
from datetime import datetime
from glob import glob

import datacube
import pandas as pd
from datacube.model import Range

dc = datacube.Datacube(app='dc-example')
def check_orphan_id(files_on_disk, item, year):
    """Cross-check indexed datasets against metadata files found on disk.

    Two passes:
      1. For every active dataset indexed for *item*/*year*, verify its
         recorded location exists and that the file's first line contains
         the dataset id.
      2. For every metadata file in *files_on_disk*, verify its id is in
         the index (distinguishing archived datasets from truly missing).

    Parameters
    ----------
    files_on_disk : iterable of str
        Paths to ga-metadata.yaml files found on disk.
    item : str
        Product name to search the index for.
    year : int
        Calendar year; datasets are searched in [year, year+1).

    Returns
    -------
    dict
        Keyed by dataset id (or on-disk id); each value records product,
        year, id, local_path and an 'issue' tag describing the mismatch.
    """
    response = {}
    # search_eager returns only active (non-archived) datasets — presumably
    # why dc.index.datasets.has() is used below to detect archived ones.
    datasets = dc.index.datasets.search_eager(
        product=item,
        time=Range(datetime(year, 1, 1), datetime(year + 1, 1, 1)))

    # Pass 1: indexed datasets -> does the file exist and carry the right id?
    for ds in datasets:
        if ds.local_path is None:
            # No location recorded in the index at all.
            response[ds.id] = {'product': item, 'year': year, 'id': ds.id,
                               'local_path': 'None', 'issue': "location_none"}
        elif not ds.local_path.exists():
            # Location recorded, but the file is gone from disk.
            # BUGFIX: was `i.local.absolute` — no such attribute; use local_path.
            response[ds.id] = {'product': item, 'year': year, 'id': ds.id,
                               'local_path': ds.local_path.absolute(),
                               'issue': "location_not_on_disk"}
        else:
            # First line of the metadata file is expected to contain the id.
            with open(str(ds.local_path), 'r') as target:
                id_on_disk = target.readline().strip()
            # BUGFIX: ds.id is a UUID; `in` against a str needs str(ds.id).
            if str(ds.id) not in id_on_disk:
                # Re-read to report the id actually stored at that location
                # (assumes "key: value" layout on the first line).
                with open(str(ds.local_path), 'r') as target:
                    id_on_disk = target.readline().split(' ')[1].strip()
                response[ds.id] = {'product': item, 'year': year,
                                   'id': id_on_disk,
                                   'local_path': ds.local_path.absolute(),
                                   'issue': "id_differs_at_location"}

    # Pass 2: files on disk -> is each one's id in the index?
    # BUGFIX: the original dict was keyed by UUID objects but probed with
    # strings, so membership never matched; compare strings to strings.
    indexed_ids = {str(ds.id) for ds in datasets}
    for path in files_on_disk:
        with open(path, 'r') as target:
            id_on_disk = target.readline().split(' ')[1].strip()
        if id_on_disk not in indexed_ids:
            # has() also sees archived datasets, so we can tell
            # "archived" apart from "not indexed at all".
            if dc.index.datasets.has(id_on_disk):
                issue = "disk_id_not_in_db_archived"
            else:
                issue = "disk_id_not_in_db"
            response[id_on_disk] = {'product': item, 'year': year,
                                    'id': id_on_disk, 'local_path': path,
                                    'issue': issue}
    return response
# Driver: scan every (year, sensor, product) combination, collect
# orphan/mismatch reports, and assemble them into one DataFrame.
indexing_stats = {}
years = list(range(1986, 1988))
products = ['_pq_scene', '_nbar_scene', '_nbart_scene', '_level1_scene']
sensors = ['ls5']  # ,'ls7','ls8']

# Metadata-file glob pattern for each product suffix.
SCENE_GLOBS = {
    '_pq_scene': '/g/data/rs0/scenes/pq-scenes-tmp/{sensor}/{year}/*/output/pqa/*/ga-metadata.yaml',
    '_nbar_scene': '/g/data/rs0/scenes/nbar-scenes-tmp/{sensor}/{year}/*/output/nbar/*/ga-metadata.yaml',
    '_nbart_scene': '/g/data/rs0/scenes/nbar-scenes-tmp/{sensor}/{year}/*/output/nbart/*/ga-metadata.yaml',
    '_level1_scene': '/g/data/v10/reprocess/{sensor}/level1/{year}/*/*/ga-metadata.yaml',
}

start_time = time.time()
for year in years:
    for sensor in sensors:
        for suffix in products:
            # BUGFIX: replaced the IPython-only `!ls ...` magic with
            # sorted(glob(...)) — plain Python, same file set, `ls`-like order.
            pattern = SCENE_GLOBS[suffix].format(sensor=sensor, year=year)
            disk_contents = sorted(glob(pattern))
            item = sensor + suffix
            # BUGFIX: key by (item, year) — keying on item alone let each
            # later year silently overwrite the previous year's results.
            indexing_stats[(item, year)] = check_orphan_id(disk_contents, item, year)
            print(year, item)
print("--- %s seconds ---" % (time.time() - start_time))

# Flatten the nested results into a (item, year, dataset_id)-indexed frame.
coherence = pd.DataFrame.from_dict(
    {(item, year, ds_id): record
     for (item, year), issues in indexing_stats.items()
     for ds_id, record in issues.items()},
    orient='index')
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement