h8rt3rmin8r

JSON.awk

May 17th, 2019
753
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. #!/usr/bin/awk -f
  2. #
  3. # Software: JSON.awk - a practical JSON parser written in awk
  4. # Version: 1.2
  5. # Author: user step- on GitHub.com
  6. # License: This software is licensed under the MIT or the Apache 2 license.
  7. # Project home: https://github.com/step-/JSON.awk.git
  8. # Credits: This software includes major portions of JSON.sh, a pipeable JSON
  9. #   parser written in Bash, retrieved on 20130313
  10. #   https://github.com/dominictarr/JSON.sh
  11. #
  12.  
  13. # See README.md for extended usage instructions.
  14. # Usage:
  15. #   awk [-v Option="value"...] -f JSON.awk "-" -or- Filepath [Filepath...]
  16. #   printf "%s\n" Filepath [Filepath...] | awk [-v Option="value"...] -f JSON.awk
  17. # Options: (default value in braces)
  18. #    BRIEF=: 0 or 1  when 1 don't print non-leaf nodes {1}
  19. #   STREAM=: 0 or 1  when 0 don't print to stdout; call externally-defined callbacks instead {1}
  20. #      Setting STREAM=0 is intended for custom applications that embed JSON.awk.
  21.  
  BEGIN { #{{{
      # Startup: apply option defaults, collect input pathnames when none were
      # given on the command line, and switch awk to whole-file records.
      if (BRIEF == "")  BRIEF=1  # when 1 parse() omits printing non-leaf nodes
      if (STREAM == "") STREAM=1 # when 0 parse() omits stdout and stores jpaths in JPATHS[]
      # for each input file:
      #   TOKENS[], NTOKENS, ITOKENS - tokens after tokenize()
      #   JPATHS[], NJPATHS - parsed data (when STREAM=0)
      # at script exit:
      #   FAILS[] - maps names of invalid files to logged error lines
      delete FAILS

      if (1 == ARGC) {
          # file pathnames from stdin
          # usage: echo -e "file1\nfile2\n" | awk -f JSON.awk
          # usage: { echo; cat file1; } | awk -f JSON.awk
          #
          # Each line read is appended to ARGV[] (starting at ARGV[2], because
          # ARGC is pre-incremented from 1). An empty line ends the list.
          #
          # The explicit "> 0" is required: getline returns -1 on a read error,
          # which is a true value and would otherwise spin this loop forever.
          while ((getline ARGV[++ARGC] < "/dev/stdin") > 0) {
              if (ARGV[ARGC] == "")
                  break
          }
      } # else usage: awk -f JSON.awk file1 [file2...]

      # set file slurping mode: a record separator that should never occur in
      # the input makes each file arrive as a single record
      srand(); RS="n/o/m/a/t/c/h" rand()
  }
  45. #}}}
  46.  
  { # main loop: process each file in turn {{{
      # One record is one whole input file (see RS in BEGIN).
      reset() # See important application note in reset()

      tokenize($0) # while(get_token()) {print TOKEN}

      # parse() always runs; the callback fires only for a clean parse in
      # embedded (STREAM=0) mode.
      if (parse() == 0) {
          if (STREAM == 0) {
              # Hand the collected jpaths to the embedding program.
              cb_jpaths(JPATHS, NJPATHS)
          }
      }
  }
  56. #}}}
  57.  
  END { # process invalid files {{{
      # In embedded mode (STREAM=0) pass the embedding program the associative
      # array of failed inputs together with their count.
      if (STREAM == 0)
          cb_fails(FAILS, NFAILS)
  }
  65. #}}}
  66.  
  function get_token() { #{{{
  # Step to the next entry of TOKENS[] and publish it in the global TOKEN.
  # Returns 1 while the new position is still before the last token
  # (ITOKENS < NTOKENS), 0 otherwise.
  # usage: {tokenize($0); while(get_token()) {print TOKEN}}
  #
  # For an external tokenizer this could instead be: return getline TOKEN

      ITOKENS++
      TOKEN = TOKENS[ITOKENS] # for internal tokenize()
      return (ITOKENS < NTOKENS)
  }
  75. #}}}
  76.  
  function parse_array(a1,   idx,ary,ret) { #{{{
  # Parse a JSON array whose opening "[" is the current TOKEN.
  #   a1 - jpath of the array itself; each element is parsed with
  #        parse_value(a1, idx), idx counting from 0.
  # On success returns 0 and sets VALUE to the "[...]" text assembled from the
  # element VALUEs when BRIEF != 1, or to "" when BRIEF == 1.
  # On failure returns the non-zero code of the failing parse_value(), or 2
  # when an element is followed by something other than "," or "]".
      idx=0
      ary=""
      get_token()
  #scream("parse_array(" a1 ") TOKEN=" TOKEN)
      if (TOKEN != "]") {
          while (1) {
              if (ret = parse_value(a1, idx)) {
                  return ret
              }
              idx=idx+1
              ary=ary VALUE
              get_token()
              if (TOKEN == "]") {
                  break
              } else if (TOKEN == ",") {
                  ary = ary ","
              } else {
                  # Compare against "" instead of testing TOKEN's truth value:
                  # a token such as 0 or 0.0 is numerically false and would be
                  # misreported as EOF.
                  report(", or ]", (TOKEN != "" ? TOKEN : "EOF"))
                  return 2
              }
              get_token()
          }
      }
      if (1 != BRIEF) {
          VALUE=sprintf("[%s]", ary)
      } else {
          VALUE=""
      }
      return 0
  }
  108. #}}}
  109.  
  function parse_object(a1,   key,obj) { #{{{
  # Parse a JSON object whose opening "{" is the current TOKEN.
  #   a1 - jpath of the object itself; each member is parsed with
  #        parse_value(a1, key), key being the quoted member name token.
  # On success returns 0 and sets VALUE to the "{...}" text assembled from the
  # key:VALUE pairs when BRIEF != 1, or to "" when BRIEF == 1.
  # Failure codes: 3 member name is not a string, 4 missing ":",
  # 5 member value failed to parse, 6 member not followed by "," or "}".
      obj=""
      get_token()
  #scream("parse_object(" a1 ") TOKEN=" TOKEN)
      if (TOKEN != "}") {
          while (1) {
              if (TOKEN ~ /^".*"$/) {
                  key=TOKEN
              } else {
                  # In all report() calls below, TOKEN is compared against ""
                  # instead of being tested for truth: a token such as 0 is
                  # numerically false and would be misreported as EOF.
                  report("string", (TOKEN != "" ? TOKEN : "EOF"))
                  return 3
              }
              get_token()
              if (TOKEN != ":") {
                  report(":", (TOKEN != "" ? TOKEN : "EOF"))
                  return 4
              }
              get_token()
              if (parse_value(a1, key)) {
                  return 5
              }
              obj=obj key ":" VALUE
              get_token()
              if (TOKEN == "}") {
                  break
              } else if (TOKEN == ",") {
                  obj=obj ","
              } else {
                  report(", or }", (TOKEN != "" ? TOKEN : "EOF"))
                  return 6
              }
              get_token()
          }
      }
      if (1 != BRIEF) {
          VALUE=sprintf("{%s}", obj)
      } else {
          VALUE=""
      }
      return 0
  }
  151. #}}}
  152.  
  function parse_value(a1, a2,   jpath,ret,x) { #{{{
  # Parse the value that starts at the current TOKEN.
  #   a1 - jpath of the enclosing container ("" or omitted at top level)
  #   a2 - key or index of this value inside that container
  # Returns 0 on success. Failure codes: 7 nested object failed, the code
  # returned by parse_array() for a nested array, 9 missing or invalid value.
  # On success the value text is left in VALUE and, unless suppressed by
  # BRIEF, a "[jpath]<TAB>value" line is printed (STREAM != 0) or appended to
  # JPATHS[] (STREAM == 0).

      # Build the jpath: "a1,a2", or just a2 when there is no parent path.
      jpath = a2 ""
      if (a1 != "")
          jpath = a1 "," jpath
  #scream("parse_value(" a1 "," a2 ") TOKEN=" TOKEN ", jpath=" jpath)

      if (TOKEN == "{") {
          if (parse_object(jpath))
              return 7
      } else if (TOKEN == "[") {
          ret = parse_array(jpath)
          if (ret)
              return ret
      } else if (TOKEN == "" || TOKEN ~ /^([^0-9])$/) {
          # Empty token means end of input (test case 20150410 #4). Otherwise,
          # at this point the only valid single-character tokens are digits.
          report("value", (TOKEN == "" ? "EOF" : TOKEN))
          return 9
      } else {
          VALUE = TOKEN
      }

      # In brief mode skip entries with an empty jpath or an empty VALUE.
      if (1 == BRIEF && ("" == jpath || "" == VALUE))
          return 0

      x = sprintf("[%s]\t%s", jpath, VALUE)
      if (0 == STREAM)
          JPATHS[++NJPATHS] = x
      else
          print x
      return 0
  }
  184. #}}}
  185.  
  function parse(   ret) { #{{{
  # Parse the token stream prepared by tokenize() as one JSON value.
  # Returns 0 on success, the non-zero code from parse_value() on a parse
  # error, or 11 when get_token() signals more input after the value.
      get_token()
      ret = parse_value()
      if (ret == 0 && get_token()) {
          # Something follows the top-level value.
          report("EOF", TOKEN)
          ret = 11
      }
      return ret
  }
  197. #}}}
  198.  
  function report(expected, got,   i,from,to,context) { #{{{
  # Log a syntax error through scream(), quoting up to 10 tokens on either
  # side of the current position with the offending token shown as <<got>>.
  #   expected - description of what the parser wanted
  #   got      - the token (or "EOF") that was found instead

      # Clamp the context window to the range of available tokens.
      from = (ITOKENS > 10) ? ITOKENS - 10 : 1
      to = (ITOKENS + 10 > NTOKENS) ? NTOKENS : ITOKENS + 10

      # Tokens before the current one.
      i = from
      while (i < ITOKENS) {
          context = context TOKENS[i] " "
          i++
      }

      context = context "<<" got ">> "

      # Tokens after the current one.
      i = ITOKENS + 1
      while (i <= to) {
          context = context TOKENS[i] " "
          i++
      }

      scream("expected <" expected "> but got <" got "> at input token " ITOKENS "\n" context)
  }
  209. #}}}
  210.  
  function reset() { #{{{
  # Clear all per-file parser state before the next input file is processed.
  #
  # Application Note:
  # To accumulate JPATHS[] across several input files:
  # 1) Comment out the two statements below that clear JPATHS[] and NJPATHS,
  #    otherwise each new input file would reset JPATHS[].
  # 2) Move the call to cb_jpaths() from the main loop to the END statement.
  # 3) In the main loop consider adding code that deletes partial JPATHS[]
  #    elements that would result from parsing invalid JSON files.
  # Compatibility Note:
  # 1) Very old gawk versions: replace 'delete JPATHS' with 'split("", JPATHS)'.

      # tokenizer state
      delete TOKENS
      NTOKENS = 0
      ITOKENS = 0
      TOKEN = ""

      # parser output
      delete JPATHS
      NJPATHS = 0
      VALUE = ""
  }
  226. #}}}
  227.  
  function scream(msg) { #{{{
  # Record an error message for the current input file and report it.
  #   msg - the error text
  # FAILS[FILENAME] accumulates the messages for a file, one per line, and
  # NFAILS counts the distinct files that have failed.

      if (!(FILENAME in FAILS))
          NFAILS += 1

      if (FAILS[FILENAME] != "")
          FAILS[FILENAME] = FAILS[FILENAME] "\n" msg
      else
          FAILS[FILENAME] = FAILS[FILENAME] msg

      # When not embedded (STREAM != 0) always print to stderr. When embedded,
      # call back the embedding program with the message and print to stderr
      # only if the callback returns non-zero.
      if (0 != STREAM || cb_fail1(msg))
          print FILENAME ": " msg >"/dev/stderr"
  }
  242. #}}}
  243.  
  function tokenize(a1,   SPACE) { #{{{
  # Split JSON text into tokens.
  #   a1 - the JSON text (the caller's copy is not modified)
  # Fills TOKENS[1..NTOKENS], rewinds ITOKENS to 0 and returns NTOKENS.
  # usage A: {for(i=1; i<=tokenize($0); i++) print TOKENS[i]}
  # see also get_token()

      # POSIX character classes (gawk) - contact me for non-[:class:] notation
      # Replaced regex constant for string constant, see https://github.com/step-/JSON.awk/issues/1
      # The single regex below is the alternation of these parts:
  #   ESCAPE="(\\[^u[:cntrl:]]|\\u[0-9a-fA-F]{4})"
  #   CHAR="[^[:cntrl:]\\\"]"
  #   STRING="\"" CHAR "*(" ESCAPE CHAR "*)*\""
  #   NUMBER="-?(0|[1-9][0-9]*)([.][0-9]+)?([eE][+-]?[0-9]+)?"
  #   KEYWORD="null|false|true"
      SPACE="[[:space:]]+"

      # Put a newline in front of every token. Any character that fits no
      # other alternative becomes a one-character token via the final ".".
      # The fraction and exponent parts of a number require at least one digit
      # (JSON grammar), so "1." or "1e" is not swallowed as a single number
      # token; the stray "." or "e" becomes its own token and is rejected by
      # the parser.
  #        gsub(STRING "|" NUMBER "|" KEYWORD "|" SPACE "|.", "\n&", a1)
      gsub(/\"[^[:cntrl:]\"\\]*((\\[^u[:cntrl:]]|\\u[0-9a-fA-F]{4})[^[:cntrl:]\"\\]*)*\"|-?(0|[1-9][0-9]*)([.][0-9]+)?([eE][+-]?[0-9]+)?|null|false|true|[[:space:]]+|./, "\n&", a1)
      # Drop the whitespace tokens, then the newline before the first token.
      gsub("\n" SPACE, "\n", a1)
      sub(/^\n/, "", a1)
      ITOKENS=0 # get_token() helper
      return NTOKENS = split(a1, TOKENS, /\n/)
  }
  264. #}}}
  265.  
  266. # vim:fdm=marker:
RAW Paste Data