import time
from datetime import datetime

import datacube
from datacube.model import Range
import pandas as pd

dc = datacube.Datacube(app='dc-example')

def check_orphan_id(files_on_disk, item, year):
    response = {}
    datasets = dc.index.datasets.search_eager(
        product=item,
        time=Range(datetime(year, 1, 1), datetime(year + 1, 1, 1)))

    # compare indexed datasets against file locations and ids on disk
    for i in datasets:
        if i.local_path is None:
            # no location is recorded in the index
            response[str(i.id)] = {'product': item, 'year': year, 'id': str(i.id),
                                   'local_path': 'None', 'issue': "location_none"}
        elif not i.local_path.exists():
            # the recorded location does not exist on disk
            response[str(i.id)] = {'product': item, 'year': year, 'id': str(i.id),
                                   'local_path': str(i.local_path.absolute()),
                                   'issue': "location_not_on_disk"}
        else:
            # the first line of ga-metadata.yaml looks like 'id: <uuid>'
            with open(str(i.local_path), 'r') as target:
                first_line = target.readline().strip()
            if str(i.id) not in first_line:
                # record the id found in the yaml at that location
                id_on_disk = first_line.split(' ')[1].strip()
                response[str(i.id)] = {'product': item, 'year': year, 'id': id_on_disk,
                                       'local_path': str(i.local_path.absolute()),
                                       'issue': "id_differs_at_location"}

    # compare files on disk against entries in the index
    datasets_dict = {}
    for i in datasets:
        # normalise ids to str so lookups work whether .id is a str or a UUID
        datasets_dict[str(i.id)] = {'local_path': i.local_path}
    for i in files_on_disk:
        with open(i, 'r') as target:
            id_on_disk = target.readline().split(' ')[1].strip()
        if id_on_disk not in datasets_dict:
            # the id is either archived in the database or absent altogether
            if dc.index.datasets.has(id_on_disk):
                issue = "disk_id_not_in_db_archived"
            else:
                issue = "disk_id_not_in_db"
            response[id_on_disk] = {'product': item, 'year': year, 'id': id_on_disk,
                                    'local_path': i, 'issue': issue}
    return response

indexing_stats = {}
years = list(range(1986, 1988))

products = ['_pq_scene', '_nbar_scene', '_nbart_scene', '_level1_scene']
sensors = ['ls5']  # ,'ls7','ls8']

start_time = time.time()
for year in years:
    for sensor in sensors:
        for item in products:
            # '!ls' is an IPython shell escape, so this block only runs
            # inside a notebook / IPython session
            if item == '_pq_scene':
                disk_contents = !ls /g/data/rs0/scenes/pq-scenes-tmp/$sensor/$year/*/output/pqa/*/ga-metadata.yaml
            if item == '_nbar_scene':
                disk_contents = !ls /g/data/rs0/scenes/nbar-scenes-tmp/$sensor/$year/*/output/nbar/*/ga-metadata.yaml
            if item == '_nbart_scene':
                disk_contents = !ls /g/data/rs0/scenes/nbar-scenes-tmp/$sensor/$year/*/output/nbart/*/ga-metadata.yaml
            if item == '_level1_scene':
                disk_contents = !ls /g/data/v10/reprocess/$sensor/level1/$year/*/*/ga-metadata.yaml
            item = sensor + item
            # merge results so a later year does not overwrite an earlier one
            indexing_stats.setdefault(item, {}).update(
                check_orphan_id(disk_contents, item, year))

            print(year, item)

print("--- %s seconds ---" % (time.time() - start_time))

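# Outside IPython the '!ls' lines above will not work; an equivalent listing
# can be built with glob. A sketch only, assuming the same NCI paths (glob
# order may differ from ls, so the results are sorted):
#
#     import glob
#
#     def list_metadata_files(pattern):
#         return sorted(glob.glob(pattern))
#
#     # e.g. for the pq scenes:
#     # disk_contents = list_metadata_files(
#     #     '/g/data/rs0/scenes/pq-scenes-tmp/%s/%s/*/output/pqa/*/ga-metadata.yaml'
#     #     % (sensor, year))
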
coherence = pd.DataFrame.from_dict({(i, j): indexing_stats[i][j]
                                    for i in indexing_stats.keys()
                                    for j in indexing_stats[i].keys()},
                                   orient='index')
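
# A quick way to inspect the result (a follow-up sketch, not in the original):
print(coherence['issue'].value_counts())  # tally of each issue type
orphans = coherence[coherence['issue'] == 'disk_id_not_in_db']
print(orphans[['product', 'year', 'local_path']])  # files on disk with no db entry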