#!/bin/bash
#
# Copyright 2018 Azul Systems Inc.  All Rights Reserved.
#
# -
#  Sample script that can be used to extract data from ZingVM GC log files
#
#  Usage: @SCRIPT@ -h
#         @SCRIPT@ [-l [<label>]] [-c [<fmt_string>]] < zing.gclog
#
#                  -h   - show this help and exit.
#
#          -l <label>   - gclog line marker (GC, GPGC-H, SYSINFO, OBJCREAT, COMPSTATS, SPS ...)
#                         if <label> is not specified, the script extracts and prints available labels from the
#                         gclog and exits.
#
#     -c <fmt_string>   - specifies what fields to print out
#                         if <fmt_string> is not specified, the script extracts and prints available columns for
#                         the label and exits.
#
#    fmt_string may contain any text.
#    Fields in the form of %{spec} will be replaced with the corresponding field values extracted from the log.
#
#    spec is the name of a field as it appears in the header line. For records that have two-line headers spec can be
#  specified in the form %{h1#h2}, where h1 unambiguously defines the group from the first header line and h2 specifies
#  a field from the second line within that group.
#
#  Examples:
#
#  $ cat ${LOG} | @SCRIPT@ -l GC -c "%{end#end}: Type: %{type}; Mode: %{gc#md}; Pause 2: start=%{pause 2#start} dur=%{pause 2#dur}"
#  $ cat ${LOG} | @SCRIPT@ -l GPGC-H -c "%{Time} %{Model#NewGen};%{KID};%{ReasonString}"
#  $ cat ${LOG} | @SCRIPT@ -l OBJCREAT -c "%{end} %{newGen} %{permGen}"
#
# -

#
#  Scraping GC Log Files
#  ---------------------
#
#  This is a sample implementation of a scraper that follows these simple rules in order to extract specific fields from a log:
#
#  Each data line is labeled with [<prefix> followed by the number of fields. There is a corresponding header that describes the fields.
#  Headers have labels in the format: [<prefix>H.
#  There can be either one-line or two-line headers.
#
#  1 - words in a multi-word header should be separated with a single space (heap info MB)
#  2 - there should be at least 2 spaces between headers (pause 2  pause 3)
#  3 - multi-word headers can be used in the 1st line of two-line headers only
#  4 - group header (1-st line) should start at the same position as the first field of the corresponding group (2-nd line).
#     ..  : intercycle               intracycle               : ..
#     ..  : sec alloc-rate perm-rate sec alloc-rate perm-rate : ..
#
#  So parsing becomes very simple: to find intracycle#alloc-rate, we first look for 'intracycle' in the first row, then look
#  for 'alloc-rate' in the second row, starting at the position where 'intracycle' was found.
#
#
# Physical directory and file name of this script; usage() re-reads the script
# file to print its own header comment.
MYDIR=$(cd "$(dirname "$0")" && pwd -P)
MYNAME=$(basename "$0")

# Pick the awk interpreter: an AWK value from the environment wins, then gawk,
# then plain awk.
AWK=${AWK:-$(command -v gawk 2>/dev/null)}
AWK=${AWK:-$(command -v awk 2>/dev/null)}
# fatal() is defined further down and does not exist yet while this line runs,
# so the error is reported inline.
if [ -z "${AWK}" ]; then
  echo "ERROR: awk not found" 1>&2
  exit 1
fi

# Print the usage text to stderr and exit.
# The text is the comment block found between the two "# -" marker lines of
# this script, with @SCRIPT@ replaced by the script name and the leading "#"
# removed from every line.
# Globals: AWK, MYDIR, MYNAME (read)
usage() {
  # The script file is fed to awk by redirection; no extra cat process needed.
  ${AWK} '/^# -$/{f++;next}f==2{exit}f==1' < "${MYDIR}/${MYNAME}" | \
      sed -e "s/@SCRIPT@/${MYNAME}/g" -e 's/^#//' 1>&2
  exit
}

# Report an error on stderr, point the user at the -h option and terminate the
# script with exit status 1.
# Arguments: the words of the error message
# Globals:   MYNAME (read)
fatal() {
  {
    printf 'ERROR: %s\n' "$*"
    printf '%s -h for help\n' "${MYNAME}"
  } >&2
  exit 1
}

# Without any arguments there is nothing to do: show the help.
[ $# -ge 1 ] || usage

# "unset" is a sentinel meaning that -l was not seen at all; an empty value
# means that -l was given without a label.
LABEL=unset

# Parsing arguments
while [ $# -ge 1 ]; do
  case "$1" in
    -h)
      usage
      ;;
    -l)
      shift
      if [ $# -gt 0 ]; then
        LABEL="$1"
        shift
      else
        LABEL=""
        break
      fi
      ;;
    -c)
      # Everything that follows -c is the format string.
      shift
      break
      ;;
    -*)
      fatal "Unrecognized option \"$1\""
      ;;
    *)
      # No other arm consumes a bare word, so without this one the loop would
      # spin forever on it.
      fatal "Unexpected argument \"$1\""
      ;;
  esac
done

[ "${LABEL}" != "unset" ] || fatal "Label param (-l) is not specified"

# -l without a label: list the labels found in header lines ("[<label>H ")
# within the first 2000 lines of the log read from stdin, then exit.
[ -n "${LABEL}" ] || {
  head -n 2000 | sed -n 's/.* \[\([A-Z][A-Z-]*\)H .*/\1/p' | grep -E -v 'GCH|GPGC-$' | sort -u
  exit 0
}

# Join whatever follows -c into a single format string.
SPEC="$*"

# Do the actual work: run the embedded awk program over the log read from stdin.
#   mark - the record label selected with -l
#   SPEC - the format string given after -c; when it is empty the program only
#          prints the list of available field specifications
${AWK} -v mark="${LABEL}" -v SPEC="${SPEC}" '
  # Get plain data from the line
  #   line     - full log line
  #   mark     - marker text to look for
  #   mark_len - length of mark
  # Returns the text that follows the first occurrence of mark, minus the last
  # character of the line: the length handed to substr() is one short of the
  # remainder (presumably to drop the closing "]" - confirm against the log format).
  # strp_pos is not a parameter and is therefore a global variable.
  function strip_line(line, mark, mark_len) {
    strp_pos = index(line, mark) + mark_len
    return substr(line, strp_pos, length(line) - strp_pos)
  }

  # Remove leading and trailing spaces and tabs from line and return the result
  function trim(line) { gsub(/^[ \t]+/, "", line); gsub(/[ \t]+$/, "", line); return line }

  # Print specifications for a single group of a two-line header
  #   h1 - group name taken from the first header line
  #   h2 - the part of the second header line that belongs to that group
  # Prints one "%{h1#field}" line for every non-empty field found in h2.
  # h2n, h2arr, k and fld are not parameters and are therefore globals.
  # NOTE(review): the separator " *" can also match the empty string; how
  # split() treats such a separator may differ between awk implementations -
  # confirm that " +" was not intended.
  function out_specs2(h1, h2) {
    h2n = split(h2, h2arr, " *")
    for (k = 1; k <= h2n; k++) {
      fld = trim(h2arr[k])
      if (fld != "") printf("%%{%s#%s}\n", trim(h1), fld)
    }
  }

  # Print specifications for a two-line header
  #   h1 - a piece of the first header line (group names)
  #   h2 - the matching piece of the second header line (field names)
  # Group names in h1 are separated by runs of two or more spaces. Every group
  # owns the slice of h2 that starts at the current position and ends where the
  # next group name begins in h1; that slice is handed to out_specs2.
  # All working variables (h1n, h1arr, i, cur_h1, nxt_h1, cur_h2, cur_start, pos)
  # are globals.
  function out_specs(h1, h2) {
    h1n = split(h1, h1arr, "  +")
    i = 1
    cur_h2 = h2
    cur_start = 1
    # The loop stops one short of h1n: the last piece is only ever looked at
    # as the "next" group name
    while (i < h1n) {
      cur_h1 = trim(h1arr[i++])
      nxt_h1 = trim(h1arr[i])

      if (nxt_h1 != "") {
        # Slice of h2 up to the column where the next group name starts in h1
        # (index() reports the first occurrence of that name)
        pos = index(h1, nxt_h1)
        cur_h2 = trim(substr(h2, cur_start, pos - cur_start ))
        cur_start = pos
      } else {
        # No further group name: the rest of h2 belongs to the current group
        cur_h2 = trim(substr(h2, cur_start))
      }

      out_specs2(cur_h1, cur_h2)
    }
  }

  # Get the number of a field that starts at the pos position
  #   line - header line
  #   pos  - character position inside line
  # Cuts line down to its first pos characters and returns the number of pieces
  # split() produces when that prefix is separated on runs of spaces.
  # l and f are not parameters and are therefore globals.
  function field_number(line, pos) {
    l = substr(line, 1, pos)
    return split(l, f, " +")
  }

  # Parse the header, get specification-to-fields binding
  #   record_fields, record_fields_cnt - not referenced in the body
  #   time_idx - number of the input field that holds the log time
  # For every name in UNRESOLVED_FIELDS that can be located in the header, the
  # position of the matching record field is stored in RESOLVED_FIELDS_IDX, the
  # name is moved to RESOLVED_FIELDS and resolved_fields_cnt is incremented.
  # If a name cannot be located, an error is printed to stderr and the program
  # exits with status 1.
  # Loop counters and working variables (j, i, fld, keys, key, start_idx,
  # key_idx, h, fld_idx) are globals.
  function resolve_fields(record_fields, record_fields_cnt, time_idx) {
    # Pad the header lines with spaces so that a name can be searched for as a
    # whole word (" name ")
    HDR[1] = " " HDR[1] " "; HDR[2] = " " HDR[2] " "
    for (j = 1; j <= total_fields_cnt; j++) {
      fld = UNRESOLVED_FIELDS[j]
      if (fld == "") continue
      # "logtime" is not a header column: it is bound to the negated number of
      # the time field
      if (fld == "logtime") {
        RESOLVED_FIELDS_IDX[j] = -time_idx
        RESOLVED_FIELDS[j] = UNRESOLVED_FIELDS[j]
        UNRESOLVED_FIELDS[j] = ""
        resolved_fields_cnt++
        continue
      }
      keys = split(fld, key, "#")
      # More "#"-separated parts than header lines: leave the name unresolved
      if (keys > hdr_lines) { continue }
      start_idx = key_idx = 1
      if (keys == 2) {
         # Locate the group in the first header line: whole-word match first,
         # plain substring match as a fallback
         if ((start_idx = index(HDR[1], " " key[key_idx] " ")) == 0) {
           start_idx = index(HDR[1], key[key_idx])
         }
         key_idx++;
      }
      # Search for the field name in the last header line, starting at the
      # position of the group
      h = substr(HDR[hdr_lines], start_idx)
      if ((fld_idx = index(h, " " key[key_idx] " ")) == 0) {
        fld_idx = index(h, key[key_idx])
      }
      if (fld_idx > 0) {
        # fld_idx is relative to h; convert it to a position in the full header
        # line. The trailing "- 1" presumably compensates for the empty piece
        # produced by the leading space added above.
        RESOLVED_FIELDS_IDX[j] = field_number(HDR[hdr_lines], fld_idx + start_idx - 1) - 1
        RESOLVED_FIELDS[j] = UNRESOLVED_FIELDS[j]
        UNRESOLVED_FIELDS[j] = ""
        resolved_fields_cnt++
      }
    }

    if (resolved_fields_cnt < total_fields_cnt) {
      # Report the first name that is still unresolved
      for (i = 1; i <= total_fields_cnt; i++) {
        fld = UNRESOLVED_FIELDS[i]
        if (fld != "") {
          print "Unable to resolve field: %{" fld "}" | "cat 1>&2"
          exit 1
        }
      }
      # Reached only when every unresolved entry is an empty name
      exit
    }
  }

  # Set up the markers and split the format string (SPEC) into literal text
  # and field names:
  #   FORMAT[n]            - literal text that precedes the n-th %{...}
  #   UNRESOLVED_FIELDS[n] - the name found inside the n-th %{...}
  #   tail                 - literal text that follows the last %{...}
  BEGIN {
    record_mark = "[" mark " "
    header_mark = "[" mark "H "
    record_mark_len = length(record_mark)
    header_mark_len = length(header_mark)
    hdr_lines = 0
    # Counters, spelled the way resolve_fields and the record rule use them
    total_fields_cnt = 0
    resolved_fields_cnt = 0

    tail = SPEC

    # Parse format (SPEC) parameter
    while (tail != "") {
      idx = index(tail, "%{")
      if (idx < 1) { break }
      total_fields_cnt++
      FORMAT[total_fields_cnt] = substr(tail, 1, idx - 1)
      tail = substr(tail, idx + 2)
      idx = index(tail, "}")
      if (idx < 1) {
        # No closing brace: report it instead of silently printing nothing
        print "Unterminated field specification: %{" tail | "cat 1>&2"
        exit 1
      }
      fld = substr(tail, 1, idx - 1)
      tail = substr(tail, idx + 1)
      UNRESOLVED_FIELDS[total_fields_cnt] = fld
    }
  }

  # Header rule: runs on the first line that carries the header marker.
  # Stores the header text in HDR[1..hdr_lines]; a second header line is picked
  # up when the line that follows carries the header marker as well.
  # When no format string was given the program stops reading here and the END
  # block prints the available specifications.
  hdr_lines == 0 && $0 ~ "\\" header_mark {
    line = strip_line($0, header_mark, header_mark_len)

    if (header_mark == "[GCH " &&
       line ==          "                         :               heap usage MB               :        gens MB        :        heap info MB         :              pages                 :        intercycle               intracycle        :      gc       :    threads  delay   :  gc    pause 1   pause 2   pause 3   pause 4  : end") {
        # This exact GC header is replaced by a re-spaced variant in which each
        # group title directly follows its ":" delimiter
        HDR[++hdr_lines] = "                         : heap usage MB                             : gens MB               : heap info MB                : pages                              : intercycle               intracycle               : gc            : threads     delay   : gc    pause 1   pause 2   pause 3   pause 4   : end"
    } else if (header_mark == "[COMPSTATSH " &&
       line ~ "Tier FullMethodCompiles OSRMethodCompiles MethodFailToInstall OSRFailToInstall TotalCPUTimeMS TotalWallClockTimeMS TotalWaitInQueueTimeMS *: WaitTimes0ms 10ms 100ms 1000ms 10000ms 100000ms 1000000ms : CompileTimes0ms 10ms 100ms 1000ms 10000ms 100000ms 1000000ms") {
        # This one-line COMPSTATS header is replaced by a two-line equivalent
        HDR[++hdr_lines] = "     Counts                                                                    TotalTimes ms                         : WaitTimes Histogram                                               : CompileTimes Histogram"
        HDR[++hdr_lines] = "Tier FullMethodCompiles OSRMethodCompiles MethodFailToInstall OSRFailToInstall CPUTime WallClockTime WaitInQueueTime : 0-9 10-99 100-999 1000-9999 10000-99999 100000-999999 1000000-INF : 0-9 10-99 100-999 1000-9999 10000-99999 100000-999999 1000000-INF"
    } else {
      HDR[++hdr_lines] = line
    }

    # Peek at the next line for a second header line. At end of input getline
    # returns 0 and leaves $0 untouched, so its result has to be checked;
    # otherwise the header line just handled would be stored a second time.
    if ((getline) > 0 && index($0, header_mark) > 0) {
      line = strip_line($0, header_mark, header_mark_len)
      HDR[++hdr_lines] = line
      if (SPEC == "") { exit }
      next
    }
    if (SPEC == "") { exit }
  }

  # Record rule: runs on every data line of the selected label once a header
  # has been seen and a format string was given. Splits the record into fields,
  # binds the requested names to field numbers on the first record, and prints
  # the format string with every %{...} replaced by the matching value.
  hdr_lines > 0 && SPEC != "" && $0 ~ "\\" record_mark {
    record = strip_line($0, record_mark, record_mark_len)
    gsub(":", " : ", record) # Make sure that delimiter is separated with spaces
    record_fields_count = split(record, fields, " ")

    if (resolved_fields_cnt == 0) {
      if (match($0, "^\\[.*\\] \\[")) { # unified logging
        time_index = 0
      } else {
        # A long first field means the time is taken from the second field
        time_index = length($1) > 10 ? 2 : 1
      }
      resolve_fields(fields, record_fields_count, time_index)
    }

    for (i = 1; i <= total_fields_cnt; i++) {
       if (RESOLVED_FIELDS[i] != "") {
         idx = RESOLVED_FIELDS_IDX[i]
         if (idx == 0) { # unified log logtime
           # $1 holds the bracketed decorations; take the first one that looks
           # like a date-time stamp or ends with "s"
           stamp_cnt = split($1, flds, "[][]")
           for (d = 2; d <= stamp_cnt; d += 2) {
             if (match(flds[d], "^[0-9]+-[0-9]+-[0-9]+T[0-9]+:[0-9]+:[0-9]+")) {
               break
             }
             if (match(flds[d], "s$")) {
               break
             }
           }
           printf("%s%s", FORMAT[i], flds[d])
         } else {
           # Positive idx: a field of the record. Negative idx: the log time,
           # taken from input field -idx with trailing characters cut off.
           # NOTE(review): substr() is called with start 0; awk implementations
           # differ on how a start below 1 affects the returned length -
           # confirm how many trailing characters are meant to be dropped.
           printf("%s%s", FORMAT[i], idx > 0 ? fields[idx] : substr($(-idx), 0, length($(-idx)) - 1))
         }
       }
    }

    # Literal text that follows the last field, plus the newline
    print tail
  }

  END {
    # If no SPEC was given - out available specifications extracted from the specified header
    if (SPEC == "" && hdr_lines > 0) {
      # logtime is always offered; it is not part of the header
      print "%{logtime}"
      if (hdr_lines == 2) {
        # Two-line header: cut both lines into ":"-delimited pieces and print
        # the specifications piece by piece. gn ends up holding the piece
        # count of the second line.
        gn = split(HDR[1] " ", arr1, ":")
        gn = split(HDR[2] " ", arr2, ":")
        for (gidx = 1; gidx <= gn; gidx++) {
          out_specs(arr1[gidx] " ", arr2[gidx] " ")
        }
      } else {
        # One-line header: every piece between runs of spaces is a field name,
        # except for the ":" delimiters
        gn = split(HDR[1], arr1, "  *")
        for (gidx = 1; gidx <= gn; gidx++) {
          fld = trim(arr1[gidx])
          if (fld != "" && fld != ":") printf("%%{%s}\n", fld)
        }
      }
    }
  }
'

