SHARE
TWEET

Untitled

a guest Dec 6th, 2019 105 Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/bin/bash
  2.  
  3. # This script can be used to use an existing dictionary to produce a pair of dictionary and query files that have certain properties
  4. # The are 7 fixed required inputs in order are:
  5. #   path to existing dictionary file
  6. #   name of output dictionary file (will be overwritten)
  7. #   name of output query file (will be overwritten)
  8. #   length of desired output dictionary
  9. #   length of desired number of queries
  10. #   sorting option; one of 'sorted', 'reverse', or 'random'
  11. #   the percentage of queries that should be in the dictionary
  12. # For the last 4 options you can write '-' for the default value
  13. #
  14. # Note that his creates a temporary file that 'explodes' the dictionary to one word per line
  15. #
  16. # You may modify this file in any way that you wish to update the way in which it generates output files
  17. #
  18. # Hint: for a fully automated experiment you probably want to write another script that calls this script with multiple
  19. #       inputs, executes the necessary tests, and records all the running times
  20. #
  21. # author: Giles, 2019
  22.  
  23. if [ $# -ne 7 ]
  24. then
  25.   echo "Call with dictionary input file, dictionary output file, query output file, dictionary length, query length, sorting option, query hit percent option"
  26.   exit
  27. fi
  28.  
  29. dict=$1
  30. dict_out=$2
  31. query_out=$3
  32. dict_len=$4
  33. query_len=$5
  34. dict_sorting=$6
  35. query_hit_percent=$7
  36.  
  37. exploded_dict="${dict}_exploded"
  38.  
  39. echo "Exploding dictionary to put one word on each line"
  40. xargs -n 1 < $dict | sort -u > $exploded_dict
  41.  
  42. read actual_dict_len f <<< $(wc -l $exploded_dict)
  43.  
  44. echo "$actual_dict_len words in dictionary"
  45.  
  46. if [ $dict_len = "-" ]
  47. then
  48.   dict_len=$actual_dict_len
  49. fi
  50.  
  51. if $(( $dict_len > $actual_dict_len ))
  52. then
  53.   echo "We cannot create new words, dictionary length must be less than actual"
  54.   exit
  55. fi
  56.  
  57. if (( $query_hit_percent < 0 )) || (( $query_hit_percent > 100 ))
  58. then
  59.   echo "query hit percent must be between 0 and 100"
  60.   exit
  61. fi
  62.  
  63. query_hit=$(( $query_hit_percent * $query_len / 100 ))
  64. query_miss=$(( $query_len - query_hit ))
  65. echo "$query_hit query hits required"
  66.  
  67. if (( $query_hit < 100 )) && [ $dict_len -eq $actual_dict_len ]
  68. then
  69.   echo "to get a query hit percentage less than 100 we need some words not selected from the dictionary to use for the misses. Set required dictionary length to less than actual length."
  70.   exit
  71. fi
  72.  
  73. if [[ $dict_sorting = "sorted" ]]
  74. then
  75.   echo "Sorting dictionary"
  76.   head -n $dict_len $exploded_dict | sort -d > $dict_out
  77. elif [[ $dict_sorting = "reverse" ]]
  78. then
  79.   echo "Reverse sorting dictionary"
  80.   head -n $dict_len $exploded_dict | sort -dr > $dict_out
  81. elif [[ $dict_sorting = "random" ]]
  82. then
  83.   echo "Randomising dictionary"
  84.   head -n $dict_len $exploded_dict |  awk 'BEGIN{srand()}{printf "%06d %s\n", rand()*1000000, $0;}' | sort -n | cut -c8- > $dict_out
  85. else
  86.   echo "No sorting performed"
  87.   head -n $dict_len $exploded_dict > $dict_out
  88. fi
  89.  
  90. rm $query_out
  91.  
  92. echo "Computing query hits"
  93. while (( $query_hit > 0 ))
  94. do
  95.   take=$(( $query_hit > $dict_len ? $dict_len : $query_hit ))
  96.   head -n $take $dict_out >> $query_out
  97.   query_hit=$(( $query_hit - $take ))
  98. done
  99.  
  100. echo "Computing query misses"
  101. leftover=$(( $actual_dict_len - $dict_len ))
  102. while (( $query_miss > 0 ))
  103. do
  104.   take=$(( $query_miss > $leftover ? $leftover : $query_miss ))
  105.   tail -n $take $exploded_dict >> $query_out
  106.   query_miss=$(( $query_miss - $take ))
  107. done
RAW Paste Data
We use cookies for various purposes including analytics. By continuing to use Pastebin, you agree to our use of cookies as described in the Cookies Policy. OK, I Understand
 
Top