GED_UID.fix.awk

- #! /usr/bin/env awk -f 
  ###############################################################################
  ##
  ## USAGE: [g|m|n]awk [-v FIX=0|1] [-v STY=<targetformat>] -f GED_UID.fix.awk [<]infile.GED [>outfile.GED]
  ## NOTES:
  ##  list of actions executed for GEDCOM-files ...
  ##
  #~  search _UID-tag-strings for patterns of 16 (or +2 for checksum) octet-sequences in hexadigit notation 
  #~  ignore (i.e. accept) several other characters common for various UUID-formats/standards (delimiters, separators, prefixes)
  #~  if pattern-matching succeeds
  #~      transform the leading 16 octets into numbers representing the significant 128-bit-value of a UUID
  #~      compute a 2-octet checksum (PAF-GEDCOM-_UID-algorithm), transform the checksum into a 4-hexadigit string
  #~      compose a new UUID-representation in <targetformat> from source-octets (accordingly plus new checksum)
  #~      compare source-string of UUID-representation (incl. all surplus characters) with target-string/format
  #~  if pattern-matching fails
  #~      create a new UUID-string in <targetformat>
  #~  if FIX is true:  replace the source-string with the target-string and output the whole (fixed) GEDCOM-file
  #~  if FIX is false: output each source GEDCOM _UID-line, followed on the next line by the computed/compared UUID
  #~      ___1-prefix-tag : <sourceformat> and <targetformat> are identical ("true")
  #~      ___X-prefix-tag : different formats, but significant 128-bit-value preserved in valid <targetformat>
  #~      ___0-prefix-tag : no match for any UUID-128-bit-value, new 128-bit generated in valid <targetformat>
  ##  
  ## OPTIONS:
  ##
  ## -v FIX=0|1
  ##    0 evaluates to false : (default) check _UID-tags for conformance with <targetformat>, output comparison
  ##    1 evaluates to true  : transform ("fix") _UID-tags value into <targetformat>, output fixed GEDCOM-file
  ##
  ## -v STY="_UID"|"GUID"|"UUID"|"XUID"|"UURN"|"XURN"|<targetformat>
  ##   "_UID" XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXCCCC (default)
  ##          PAF-GEDCOM-_UID 16+2 bytes, 36 chars uppercase hexdigit with checksum
  ##   "UUID" xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
  ##          RFC-4122-UUIDv4 16 bytes, 32+4 chars lowercase hexdigit hyphen-grouped
  ##   "GUID" {XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX}
  ##          embraced {UUIDv4} 16 bytes, 32+6 chars uppercase hexdigit hyphen-grouped
  ##   "XUID" {XXXxXXxx-XXxX-XxXx-Xxxx-xxXXxXXxXXxx}cccc
  ##          extended mixedcase and -style {GUIDv4}, 4-hexdigit checksum appended
  ##   "UURN" urn:uuid:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
  ##          prefixed lowercase "urn:uuid:UUIDv4" (RFC-4122, UUID as URN)
  ##   "XURN" urn:uuid:xXXxxXXx-XXxx-XXxx-XXXX-xXxxXXxXxxxX+cccc
  ##          extended mixedcase "urn:uuid:UUIDv4+checksum" (RFCs 2141+3986+4122)
  ##    else: XXXXxxxx-XXxX-XxXX-XXXX-XXXxXxXxxxXx cccc
  ##          combined mixedcase UUIDv4 with 4-hexdigit checksum (set apart)
  ##
  #~ New (self-) generated UUIDs are always of RFC-4122 random type v4, 
  #~ independent of a grouped or straight format. Divergent from standard, 
  #~ the generator outputs randomly mixed-case letters. The non-standard 
  #~ XUID-, XURN- and fallback-targetformats (if user's choice of "format" is an 
  #~ undefined token) are case-preserving, but easy to convert. 
  #~
  #~ Valid source-values (and their notation-fragments) take precedence 
  #~ over generated values. Combined with the case-preserving XUID-, XURN- and 
  #~ fallback-targetformats, mixed-case output may therefore stem either from 
  #~ the source (case copied) or from the generator (randomly mixed case). As 
  #~ long as vendors do not publish their creation algorithm, mixed-case 
  #~ source-UUIDs are not really comparable at string-level: the patterns will 
  #~ most likely always differ and be recommended for change. Moreover, the 
  #~ lettercase cannot be recovered after a normalization or change of format. 
  ##
  ###############################################################################
  ##
  #~ RFC-2141, URN Syntax
  #~
  #~ Some namespaces may define additional lexical equivalences, such as
  #~ case-insensitivity of the NSS (or parts thereof).
  #~
  #~ RFC-4122, A Universally Unique IDentifier (UUID) URN Namespace
  #~
  #~ The internal representation of a UUID is a specific sequence of
  #~ bits in memory, [...].  To accurately
  #~ represent a UUID as a URN, it is necessary to convert the bit
  #~ sequence to a string representation.
  #~
  #~ Each field is treated as an integer and has its value printed as a
  #~ zero-filled hexadecimal digit string with the most significant
  #~ digit first.  The hexadecimal values "a" through "f" are output as
  #~ lower case characters and are case insensitive on input.
  ##
  ###############################################################################
  ##
  ## 2013 ~ Stefan Unterstein <http://unterstein.net/ged1212xml>
  ##
  ## This program is free software ("freeware"): 
  ## you can redistribute it and/or modify it as you like.
  ##
  ## The program is distributed in the hope that it will be useful,
  ## but WITHOUT ANY WARRANTY; without even the implied warranty of
  ## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
  ##
  ###############################################################################
  
- BEGIN {
      
      FIX = FIX ? !!FIX : 0 ;  
      STY = (STY~/^([_GUX]UID|[UX]URN)$/) ? STY : ((STY~/^$/) ? "_UID" : "") ;
-     #
      #  FIX default = 0 = false = check only 
      #              = output given vs computed _UIDs only
      #
      #  awk -f GED_UID.fix.awk infile.ged | grep -B 1 "___[0X]"
      #
      #  will filter flawed _UIDs and computed replacements
      #
      #  awk -v FIX=1 -f GED_UID.fix.awk inflawed.ged > outfixed.ged
      #
      #  will fix 'em (to PAF-compatible _UIDs)
      #
      #  awk -v FIX=1 -v STY=UUID -f GED_UID.fix.awk inflawed.ged > outfixed.ged
      #
      #  will fix 'em (to RFC-compatible UUIDs)
      #
      
      mkXB2N(xbyte); split("01234567cdef89ab89AB01234567CDEF",xchar,"");  
-     #
      # make xbyte an array of HexDigit-Byte-(zero-filled)-Indices-to-Number
      #      xbyte["00"]=0 xbyte["01"]=1 .. "ff"="Ff"="fF"="FF"=255
      #
      # xchar for UUIDv4 = xxxxxxxx-xxxx-4xxx-Yxxx-xxxxxxxxxxxx
      #
      # usage lower case:
      #   x = xchar[int(rand()*16+1)]
      #   y = xchar[int(rand()*4+13)]
      #
      # usage mixed case:
      #   x = xchar[int(rand()*32+1)]
      #   y = xchar[int(rand()*8+13)]
      #
      # usage upper case:
      #   x = xchar[int(rand()*16+17)]
      #   y = xchar[int(rand()*4+17)]
      #
      
      Hx01RE = "[0-9a-fA-F]";
      Hx02RE = Hx01RE Hx01RE; # octet/byte
      Hx04RE = Hx02RE "-?" Hx02RE;
      Hx08RE = Hx04RE "-?" Hx04RE;
      Hx12RE = Hx04RE "-?" Hx04RE "-?" Hx04RE;
      chksRE = "([- +]?" Hx04RE ")?"
      xuidRE = "{?" Hx08RE "-?" Hx04RE "-?" Hx04RE "-?" Hx04RE "-?" Hx12RE "}?" chksRE;
      xurnRE = "([uU][rR][nN]:[uU][uU][iI][dD]:)?" xuidRE;
-     # 
      #  captures GUIDs, UUIDs, _UIDs, URNs prefix, with or w/o plus|minus|space checksum, any lettercase, any hyphen-byte-grouping
      #> marks output- or replacement-formats, four of them canonical or quasi-standards
      # 
      #   xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
      #>  XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXCCCC
      #  {xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx}
      #  {xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx}cccc
      #>  xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
      #   xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxxcccc
      #> {XXXXXXXX-XXXX-XXXX-XXXX-XXXXXXXXXXXX}
      #> {XxXXXxXX-xxxX-XXXx-XxxX-XxXXxxxXXXxX}cccc
      #> urn:uuid:xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx
      #> urn:uuid:XXxXXXXx-XXxX-xXxx-Xxxx-xxxxXxxXXxxX+cccc
      #
      #  ... any hyphen-byte-grouping from none to all (grouping half-byte "nibbles" doesn't make any sense to me)
      #
      #   xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx
      #   xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx cc-cc
      #  {xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx-xx}�cc-cc
      #
  }
  
- ###############################################################################
  ###############################################################################
  
- $1 ~ /[0-9]/ && $2 == "_UID" {
      
      gvnUUID = match($3,xurnRE) ? substr($3,RSTART,RLENGTH) : "" ;
      if (gvnUUID)
-     {
-         #~ valUUID = (length(gvnUUID)>=32) ? substr(gvnUUID,1,32) : "" ;
          #~ chkUUID = (length(gvnUUID)==36) ? substr(gvnUUID,33,4) : "" ;
          cmpUUID = uuid4matter(gvnUUID,STY);  
          if ($0 == ($1 " " $2 " " cmpUUID))
-         {
              $0 = (FIX) ? $0 : ($0 "\n" $1+1 " ___1 " cmpUUID) ;  
-             #
              #  true <targetformat> and value, comp'd and given ID+checksum are identical
              #  if <targetformat>==_UID (default), value and format are likely to be accepted by PAF-compatibles
              #
          } else {
              $0 = ((FIX) ? ($1 " " $2 " ") : ($0 "\n" $1+1 " ___X ")) cmpUUID ;  
-             #
              #  true UUID 128-bit value, but false format or checksum, or surplus characters 
              #  value now preserved and transformed into <targetformat>, accordingly plus new checksum 
              #  if <targetformat>==_UID and not eXchanged, this and next are likely to be rejected by PAF-compatibles
              #
          }                           
      } else {
          $0 = ((FIX) ? ($1 " " $2 " ") : ($0 "\n" $1+1 " ___0 ")) uuid4matter(mkUUID(),STY) ;  
-         #
          #  false, no (valid) UUID or 128-bit-value available, new UUID in <targetformat> generated
          #
      }
      if (!FIX) print;
  }
  
  ###############################################################################
  
  FIX { print; }
      
- ###############################################################################
  ###############################################################################
  # functions
  ###############################################################################
  
  function mkUUID(    UUID)  #  31 rand() per UUID, miXed case; depends on global xchar[]
- {
      UUID = "xxxxxxxx-xxxx-4xxx-" xchar[int(rand()*8+13)] "xxx-xxxxxxxxxxxx";
      while(sub(/x/,xchar[int(rand()*32+1)],UUID));
      return UUID;
  }
  
  function mkXB2N(a,  i,j,x,X,n)  # make HexDigit-Byte-(zero-filled)-to-Number Array
- {
      split("0123456789abcdef",x,""); split("0123456789ABCDEF",X,""); n=0;
      for (i=1; i<17; i++)
-     {
          for (j=1; j<17; j++)
-         {
              a[x[i]""x[j]]=a[x[i]""X[j]]=a[X[i]""x[j]]=a[X[i]""X[j]]=n++; 
          }
      }
  }
  
  function uuid4matter(UUID,fmt,  BytesSum1,BytesSum2,ChecksHex,CanonUUID,n) 
- {
      gsub(/([uU][rR][nN]:[uU][uU][iI][dD]:)|[-{ }+]/,"",UUID); UUID = substr(UUID,1,32);
      for (n=1; n<17; n++) 
-     {
          BytesSum1 += xbyte[substr(UUID,n*2-1,2)]; # mkXB2N(xbyte); # xbyte["00"]=0 xbyte["01"]=1 .. "ff"="Ff"="fF"="FF"=255
          BytesSum2 += BytesSum1;
      }
      ChecksHex = sprintf("%02x%02x",BytesSum1 % 256,BytesSum2 % 256);
      CanonUUID = substr(UUID,1,8) "-" substr(UUID,9,4) "-" substr(UUID,13,4) "-" substr(UUID,17,4) "-" substr(UUID,21,12);
      
-     if (fmt=="_UID") {
          return toupper(UUID ChecksHex);
      } else if (fmt=="GUID") {
          return "{" toupper(CanonUUID) "}";
      } else if (fmt=="UUID") {
          return tolower(CanonUUID);
      } else if (fmt=="XUID") {
          return "{" CanonUUID "}" ChecksHex;
      } else if (fmt=="UURN") {
          return "urn:uuid:" tolower(CanonUUID);
      } else if (fmt=="XURN") {
          return "urn:uuid:" CanonUUID "+" ChecksHex;
      } else return CanonUUID " " ChecksHex;
  }
  
- ###############################################################################
  # EOF 
  ###############################################################################