OS/2 REXX or NT ooREXX: convert UTF-8 to SBCS and vice versa

Автор: Frank Ellermann

Иcточник: https://omniplex.om.funpic.de/

/* OS/2 REXX or NT ooREXX: convert UTF-8 to SBCS and vice versa.  */

/* The REXX source is designed for SBCS codepages based on ASCII, */

/* anything else (EBCDIC, DBCS, UTF-7, etc.) won't work.          */

 

/* This is only the code and a small test suite, copy procedures  */

/* UTF.I (UTF-8 to local), UTF.O (local to UTF-8), and UTF.8 to a */

/* a script needing UTF-8 conversions.                            */

 

/* UTF.I( x, cp ) decodes an UTF-8 string x for codepage cp.      */

/* UTF.O( x, cp ) encodes a codepage cp string x into UTF-8.      */

/* The 2nd argument cp can be omitted after it was initialized.   */

 

/* UTF. is a global variable exposed by UTF.I() and UTF.O(), it   */

/* is reinitialized if a 2nd argument for UTF.I() or UTF.O() does */

/* not match the last used local codepage.                        */

 

/* History - see also <URL:https://purl.net/xyzzy/src/utf-8.cmd> : */

/* 0.1 - added codepage 437 using <URL:https://www.eki.ee/letter/> */

/*     - obvious bug in UTF.I() REXX positional parsing fixed :-( */

/* 0.2 - avoid syntax trap in UTF.I() for invalid UTF.8 strings   */

/*     - added UTF.I() test cases for nine invalid UTF-8 strings  */

/*     - moved old tests to procedure DEBUG, two codepages tested */

/*     - use OS/2 SysQueryProcessCodePage() directly (+ comments) */

/* 0.3 - 80..BF now "eat" only 1 byte, shown as one unknown char. */

/*     - C0..C1 still "eat" 2 bytes, shown as 1 unknown character */

/*     - F5..F7 still "eat" 4 bytes (F5..FD illegal for RfC 3629) */

/*     - F8..FB still "eat" 5 bytes (F5..FD unused for ISO 10646) */

/*     - FC..FD still "eat" 6 bytes (F5..FD allow 2**31 Unicodes) */

/*     - FE..FF now "eat" only 1 byte, shown as one unknown char. */

/*     - added tests EF BB BF (u+FEFF BOM) and C0 AF (bad 2F '/') */

/*     - bad / unknown / unsupported character shown as UTF.? set */

/*       by UTF. = '?', any US-ASCII character could be used      */

/* 0.4 - bug fix for windows-1252 (OS/2 1004) 8D, 8E, 9D, 9E, 9F  */

/* 0.5 - bug fix for invalid u+4000000 encoding as FC84 8080 8080 */

/*       etc. only used in <https://purl.net/xyzzy/kex/x-wiki.kex> */

/* 0.6 - SysQueryProcessCodePage() removed:  UTF.I() and UTF.O()  */

/*       now expect a 2nd argument specifying the local codepage  */

/* 0.7 - replaced UTF-8 prose explanation by simple CharMapML     */

/*     - replaced '?' by ASCII SUB (0x1A) for unmapped char.s     */

/*     - added Latin-9 and MacRoman; explicit Latin-1, no default */

/* 0.8 - added ibm-878 (KOI8-R) for the Russian OS/2 community    */

/* 0.9 - renamed 'MAC' Roman to '10000' (the number used on W2K)  */

/*       added '28591' as alias of '819' for ISO 8859-1           */

/*       added '28605' as alias of '923' for ISO 8859-15          */

/*       Various not yet supported W2K codepages to complete the  */

/*       already implemented Latin-1 and Cyrillic variants, plus  */

/*       some obscure W2K codepages noted here "while I'm at it": */

/*         855:  OEM Cyrillic                                     */

/*         866:  OEM Russian                                      */

/*        1251: ANSI Cyrillic, presumably 28595 excl. C1 controls */

/*       10017:  MAC Cyrillic                                     */

/*       28593:  IS0 8859-3 (Latin-3, Esperanto)                  */

/*       28595:  IS0 8859-5 (Cyrillic)                            */

/*       28599:  IS0 8859-9 (Latin-5)                             */

/*       65001: UTF-8        ToDo: find IBM UTF-8 codepage number */

/*       20127: US-ASCII     ToDo: figure out what US-ASCII is... */

/*       20105:  IA5 IRV     ToDo: allow pure 7bit US-ASCII input */

/*       20106:  IA5 German (out of scope, noted for reference)   */

/*       20261: T.61                         ToDo: what is this ? */

/*       20269:  ISO 6937 non-spacing accent (out of scope)       */

/*       21027:  Ext Alpha lower case        ToDo: what is this ? */

/* --------------------------------------------------------------

<?xml version="1.0" encoding="US-ASCII" ?>

<!DOCTYPE characterMapping SYSTEM

 "https://www.unicode.org/reports/tr22/CharacterMapping.dtd">

 

<characterMapping

 id="utf-8"

 version="1"

 description="Based on the UTF-8 example in UTS #22"

 normalization="neither">

<validity>

    <state type="FIRST" next="VALID"    s="00" e="7F" />

    <state type="FIRST" next="T1"       s="C2" e="DF" />

    <state type="FIRST" next="LE0"      s="E0"        />

    <state type="FIRST" next="T2"       s="E1" e="EC" />

    <state type="FIRST" next="LED"      s="ED"        />

    <state type="FIRST" next="T2"       s="EE" e="EF" />

    <state type="FIRST" next="LF0"      s="F0"        />

    <state type="FIRST" next="T3"       s="F1" e="F3" />

    <state type="FIRST" next="LF4"      s="F4"        />

 

    <state type="T1"    next="VALID"    s="80" e="BF" />

    <state type="T2"    next="T1"       s="80" e="BF" />

    <state type="T3"    next="T2"       s="80" e="BF" />

 

    <state type="LE0"   next="T1"       s="A0" e="BF" />

    <state type="LED"   next="T1"       s="80" e="9F" />

 

    <state type="LF0"   next="T2"       s="90" e="BF" />

    <state type="LF4"   next="T2"       s="80" e="8F" />

</validity>

<assignments  sub="EF BF BD">

    <range bFirst="00"              bLast="7F"

             bMin="00"               bMax="7F"

           uFirst="0000"            uLast="007F"        />

    <range bFirst="C2 80"           bLast="DF BF"

             bMin="C2 80"            bMax="DF BF"

           uFirst="0080"            uLast="07FF"        />

    <range bFirst="E0 A0 80"        bLast="ED 9F BF"

             bMin="E0 80 80"         bMax="ED BF BF"

           uFirst="0800"            uLast="D7FF"        />

    <range bFirst="EE 80 80"        bLast="EF BF BF"

             bMin="EE 80 80"         bMax="EF BF BF"

           uFirst="E000"            uLast="FFFF"        />

    <range bFirst="F0 90 80 80"     bLast="F4 8F BF BF"

             bMin="F0 80 80 80"      bMax="F4 BF BF BF"

           uFirst="10000"           uLast="10FFFF"      />

</assignments>

</characterMapping>

   -------------------------------------------------------------- */

 

   signal on  novalue  name TRAP    ;  signal on  syntax name TRAP

   signal on  failure  name TRAP    ;  signal on  halt   name TRAP

 

   if UTF.O( /**/, 819 ) \== ''  then  exit TRAP( 'init. Latin-1' )

 

   U = x2c( 77 66 55 44 33 22 )     /* up to 5 char.s "eaten" by  */

   do N = 0 to 8                    /* test invalid UTF-8 strings */

      C = x2c( 22 || b2x( left( copies( 1, N ), 8, 0 ))) || U

      if N = 0 then  C = x2c( '22 EF BB BF 22 C0 AF 22' )

      say 'bad UTF-8' c2x( C ) '=>' c2x( UTF.I( C )) UTF.I( C )

   end N

 

   Q = '437 858 1252 819 923 878 10000'

   do W = 1 to words( Q )

      CP = word( Q, W )

      select

         when  CP =   437  then  P = '( US PC DOS)   437:'

         when  CP =   858  then  P = '( OS/2  850)   858:'

         when  CP =  1252  then  P = '( OS/2 1004)  1252:'

         when  CP =   819  then  P = '(ISO 8859-1)   819:'

         when  CP =   923  then  P = '(ISO 8859-15)  923:'

         when  CP =   878  then  P = '( KOI8-R   )   878:'

         when  CP = 10000  then  P = '(MAC Roman ) 10000:'

         otherwise               P = right( CP, 18 ) || ':'

      end

      say P DEBUG( CP )

   end W

   exit 0

 

DEBUG:   procedure

   do N = 0 to 255                  /* check 256 local characters */

      C = centre( d2c( N ), 3 )     ;  U = UTF.O( C, arg( 1 ))

      if UTF.I( U ) == C then iterate N

      say 'error at' N              ;  trace ?R

      U = UTF.O( C )                ;  call UTF.I U

      say result == C               ;  return 'fail'

   end N

 

   U = 128                          /* find 128 UTF-8 characters: */

   do N = U to 65535 until U = 256

      B = reverse( x2b( d2x( N )))  ;  C = ''

 

      do L = 2 until verify( substr( B, 8 - L ), 0 ) = 0

         C = C || left( B, 6, 0 ) || 01

         B = substr( B, 7 )

      end L

 

      B = C || left( B, 8 - L, 0 ) || copies( 1, L )

      C = x2c( b2x( reverse( B )))

      U = U + ( UTF.I( C ) <> UTF.? )

   end N                            /* test error character UTF.? */

 

   N = 'found' U 'of 256 SBCS characters up to u+' || d2x( N, 4 )

   if U = 256  then  return 'okay,' N

               else  return 'fail,' N

 

/* -------------------------------------------------------------- */

/* <URL:https://purl.net/xyzzy/src/utf-8.cmd> 0.8, (c) F.Ellermann */

 

UTF.I:   procedure expose UTF.      /* UTF-8 to local charset     */

   parse arg SRC  ;  DST = ''       ;  UTF.8 = UTF.8( arg( 2 ))

 

   do while SRC <> ''

      POS = verify( SRC, UTF.8 ) -1 ;  if POS < 0 then leave

      DST = DST || left( SRC, POS ) ;  SRC = substr( SRC, POS + 1 )

      POS = verify( x2b( c2x( left( SRC, 1 ))), 1 ) -1

 

      if POS > 1 & POS < 7 then  do /* C0..FD introduce 2-6 bytes */

         TOP = left( SRC, POS )     ;  SRC = substr( SRC, POS + 1 )

         DST = DST || UTF.TOP       /* surrogates implicitly bad, */

      end                           /* C0..C1 are implicitly bad, */

      else  do                      /* 80..BF and FE..FF illegal: */

         DST = DST || UTF.?         ;  SRC = substr( SRC, 2 )

      end                           /* show error character UTF.? */

   end

   return DST || SRC

 

UTF.O:   procedure expose UTF.      /* local charset to UTF-8     */

   parse arg SRC  ;  DST = ''       ;  UTF.8 = UTF.8( arg( 2 ))

 

   do while SRC <> ''

      POS = verify( SRC, UTF.8 ) -1 ;  if POS < 0 then leave

      DST = DST || left( SRC, POS ) ;  SRC = substr( SRC, POS + 1 )

      parse var SRC TOP 2 SRC       ;  DST = DST || UTF.TOP

   end

   return DST || SRC

 

UTF.8:   procedure expose UTF.      /* initialize Unicode table   */

   arg PAGE

   select

      when  PAGE = value( 'UTF..' )                then  nop

      when  PAGE = '' & symbol( 'UTF..' ) = 'VAR'  then  nop

   otherwise

      if symbol( 'UTF.?' ) = 'VAR'  then  T = UTF.?

                                    else  T = x2c( 1A )

      drop UTF.      ;  UTF. = T    /* SUB unknown char.s by 0x1A */

      UTF.. = PAGE   ;  T = ''      /* note actual codepage UTF.. */

      select                        /* -------------------------- */

         when  PAGE =  437 then  do                /*  US OEM DOS */

            T = T '  C7   FC   E9   E2   E4   E0   E5   E7' /* 80 */

            T = T '  EA   EB   E8   EF   EE   EC   C4   C5' /* 88 */

            T = T '  C9   E6   C6   F4   F6   F2   FB   F9' /* 90 */

            T = T '  FF   D6   DC   A2   A3   A5 20A7  192' /* 98 */

            T = T '  E1   ED   F3   FA   F1   D1   AA   BA' /* A0 */

            T = T '  BF 2310   AC   BD   BC   A1   AB   BB' /* A8 */

            T = T '2591 2592 2593 2502 2524 2561 2562 2556' /* B0 */

            T = T '2555 2563 2551 2557 255D 255C 255B 2510' /* B8 */

            T = T '2514 2534 252C 251C 2500 253C 255E 255F' /* C0 */

            T = T '255A 2554 2569 2566 2560 2550 256C 2567' /* C8 */

            T = T '2568 2564 2565 2559 2558 2552 2553 256B' /* D0 */

            T = T '256A 2518 250C 2588 2584 258C 2590 2580' /* D8 */

            T = T ' 3B1   DF  393  3C0  3A3  3C3   B5  3C4' /* E0 */

            T = T ' 3A6  398  3A9  3B4 221E  3C6  3B5 2229' /* E8 */

            T = T '2261   B1 2265 2264 2320 2321   F7 2248' /* F0 */

            T = T '  B0 2219   B7 221A 207F   B2 25A0   A0' /* F8 */

         end                        /* -------------------------- */

         when  PAGE =  858 | PAGE = 850   then do  /* western DOS */

            T = T '  C7   FC   E9   E2   E4   E0   E5   E7' /* 80 */

            T = T '  EA   EB   E8   EF   EE   EC   C4   C5' /* 88 */

            T = T '  C9   E6   C6   F4   F6   F2   FB   F9' /* 90 */

            T = T '  FF   D6   DC   F8   A3   D8   D7  192' /* 98 */

            T = T '  E1   ED   F3   FA   F1   D1   AA   BA' /* A0 */

            T = T '  BF   AE   AC   BD   BC   A1   AB   BB' /* A8 */

            T = T '2591 2592 2593 2502 2524   C1   C2   C0' /* B0 */

            T = T '  A9 2563 2551 2557 255D   A2   A5 2510' /* B8 */

            T = T '2514 2534 252C 251C 2500 253C   E3   C3' /* C0 */

            T = T '255A 2554 2569 2566 2560 2550 256C   A4' /* C8 */

            T = T '  F0   D0   CA   CB   C8 20AC   CD   CE' /* D0 */

            T = T '  CF 2518 250C 2588 2584   A6   CC 2580' /* D8 */

            T = T '  D3   DF   D4   D2   F5   D5   B5   FE' /* E0 */

            T = T '  DE   DA   DB   D9   FD   DD   AF   B4' /* E8 */

            T = T '  AD   B1 2017   BE   B6   A7   F7   B8' /* F0 */

            T = T '  B0   A8   B7   B9   B3   B2 25A0   A0' /* F8 */

            /* 0xD5 850: u+0131 small dotless i, 858: u+20AC Euro */

         end                        /* -------------------------- */

         when  PAGE =  819 | PAGE = 28591 then  do /* ISO 8859-1  */

            do N = 128 to 255 ;  T = T d2x( N ) ;  end N /* 80-FF */

         end                        /* -------------------------- */

         when  PAGE =  923 | PAGE = 28605 then  do /* ISO 8859-15 */

            do N = 128 to 159 ;  T = T d2x( N ) ;  end N /* 80-9F */

            T = T '  A0   A1   A2   A3 20AC   A5  160   A7' /* A0 */

            T = T ' 161   A9   AA   AB   AC   AD   AE   AF' /* A8 */

            T = T '  B0   B1   B2   B3  17D   B5   B6   B7' /* B0 */

            T = T ' 17E   B9   BA   BB  152  153  178   BF' /* B8 */

            do N = 192 to 255 ;  T = T d2x( N ) ;  end N /* C0-FF */

         end                        /* -------------------------- */

         when  PAGE = 1252 | PAGE = 1004  then  do /* OEM Latin-1 */

            T = T '20AC   81 201A  192 201E 2026 2020 2021' /* 80 */

            T = T ' 2C6 2030  160 2039  152   8D  17D   8F' /* 88 */

            T = T '  90 2018 2019 201C 201D 2022 2013 2014' /* 90 */

            T = T ' 2DC 2122  161 203A  153   9D  17E  17F' /* 98 */

            do N = 160 to 255 ;  T = T d2x( N ) ;  end N /* A0-FF */

         end                        /* -------------------------- */

         when  PAGE =  878 then  do /* KOI8-R (ibm-878)           */

            T = T '2500 2502 250C 2510 2514 2518 251C 2524' /* 80 */

            T = T '252C 2534 253C 2580 2584 2588 258C 2590' /* 88 */

            T = T '2591 2592 2593 2320 25A0 2219 221A 2248' /* 90 */

            T = T '2264 2265   A0 2321   B0   B2   B7   F7' /* 98 */

            T = T '2550 2551 2552  451 2553 2554 2555 2556' /* A0 */

            T = T '2557 2558 2559 255A 255B 255C 255D 255E' /* A8 */

            T = T '255F 2560 2561  401 2562 2563 2564 2565' /* B0 */

            T = T '2566 2567 2568 2569 256A 256B 256C   A9' /* B8 */

            T = T ' 44E  430  431  446  434  435  444  433' /* C0 */

            T = T ' 445  438  439  43A  43B  43C  43D  43E' /* C8 */

            T = T ' 43F  44F  440  441  442  443  436  432' /* D0 */

            T = T ' 44C  44B  437  448  44D  449  447  44A' /* D8 */

            T = T ' 42E  410  411  426  414  415  424  413' /* E0 */

            T = T ' 425  418  419  41A  41B  41C  41D  41E' /* E8 */

            T = T ' 41F  42F  420  421  422  423  416  412' /* F0 */

            T = T ' 42C  42B  417  428  42D  429  427  42A' /* F8 */

         end                        /* -------------------------- */

         when PAGE = '10000'  then  do   /* MAC Roman             */

            T = T '  C4   C5   C7   C9   D1   D6   DC   E1' /* 80 */

            T = T '  E0   E2   E4   E3   E5   E7   E9   E8' /* 88 */

            T = T '  EA   EB   ED   EC   EE   EF   F1   F3' /* 90 */

            T = T '  F2   F4   F6   F5   FA   F9   FB   FC' /* 98 */

            T = T '2020   B0   A2   A3   A7 2022   B6   DF' /* A0 */

            T = T '  AE   A9 2122   B4   A8 2260   C6   D8' /* A8 */

            T = T '221E   B1 2264 2265   A5   B5 2202 2211' /* B0 */

            T = T '220F  3C0 222B   AA   BA  3A9   E6   F8' /* B8 */

            T = T '  BF   A1   AC 221A  192 2248 2206   AB' /* C0 */

            T = T '  BB 2026   A0   C0   C3   D5  152  153' /* C8 */

            T = T '2013 2014 201C 201D 2018 2019   F7 25CA' /* D0 */

            T = T '  FF  178 2044 20AC 2039 203A FB01 FB02' /* D8 */

            T = T '2021   B7 201A 201E 2030   C2   CA   C1' /* E0 */

            T = T '  CB   C8   CD   CE   CF   CC   D3   D4' /* E8 */

            T = T 'F8FF   D2   DA   DB   D9  131  2C6  2DC' /* F0 */

            T = T '  AF  2D8  2D9  2DA   B8  2DD  2DB  2C7' /* F8 */

            /* 0xBD old u+2126 Ohm             : new u+03A9 Omega */

            /* 0xDB old u+00A4 currency symbol : new u+20AC Euro  */

            /* 0xF0 old u+2665 black heart suit: new u+F8FF priv. */

         end                        /* -------------------------- */

      end                           /* otherwise force REXX error */

 

      do N = 128 to 255             /* table of UTF-8 characters: */

         parse var T SRC T ;  DST = ''

         SRC = reverse( x2b( SRC )) /* scalar bits right to left  */

 

         do LEN = 2 until verify( substr( SRC, 8 - LEN ), 0 ) = 0

            DST = DST || left( SRC, 6, 0 ) || '01'

            SRC = substr( SRC, 7 )  /* encoded 6 bits of scalar   */

         end LEN                    /* remaining bits of scalar:  */

         DST = DST || left( SRC, 7 - LEN, 0 ) || 0

         DST = x2c( b2x( reverse( DST || copies( 1, LEN ))))

 

         SRC = d2c( N )             /* SRC: 1 byte (local char.)  */

         UTF.DST = SRC              /* DST: 2 or more UTF-8 bytes */

         UTF.SRC = DST              /* excluding us-ascii 0..127  */

      end N

   end

   return xrange( x2c( 0 ), x2c( 7F ))

 

/* see <URL:https://purl.net/xyzzy/rexxtrap.htm>, (c) F. Ellermann */

 

UTIL: procedure                  /* load necessary RexxUtil entry */

   if RxFuncQuery(  arg( 1 )) then

      if RxFuncAdd( arg( 1 ), 'RexxUtil', arg( 1 )) then

         exit TRAP( "can't add RexxUtil"  arg( 1 ))

   return 0

 

TRAP:                            /* select REXX exception handler */

   call trace 'O' ;  trace N           /* don't trace interactive */

   parse source TRAP                   /* source on separate line */

   TRAP = x2c( 0D ) || right( '+++', 10 ) TRAP || x2c( 0D0A )

   TRAP = TRAP || right( '+++', 10 )   /* = standard trace prefix */

   TRAP = TRAP strip( condition( 'c' ) 'trap:' condition( 'd' ))

   select

      when wordpos( condition( 'c' ), 'ERROR FAILURE' ) > 0 then do

         if condition( 'd' ) > ''      /* need an additional line */

            then TRAP = TRAP || x2c( 0D0A ) || right( '+++', 10 )

         TRAP = TRAP '(RC' rc || ')'   /* any system error codes  */

         if condition( 'c' ) = 'FAILURE' then rc = -3

      end

      when wordpos( condition( 'c' ), 'HALT SYNTAX'   ) > 0 then do

         if condition( 'c' ) = 'HALT' then rc = 4

         if condition( 'd' ) > '' & condition( 'd' ) <> rc then do

            if condition( 'd' ) <> errortext( rc ) then do

               TRAP = TRAP || x2c( 0D0A ) || right( '+++', 10 )

               TRAP = TRAP errortext( rc )

            end                        /* future condition( 'd' ) */

         end                           /* may use errortext( rc ) */

         else  TRAP = TRAP errortext( rc )

         rc = -rc                      /* rc < 0: REXX error code */

      end

      when condition( 'c' ) = 'NOVALUE'  then rc = -2 /* dubious  */

      when condition( 'c' ) = 'NOTREADY' then rc = -1 /* dubious  */

      otherwise                        /* force non-zero whole rc */

         if datatype( value( 'RC' ), 'W' ) = 0 then rc = 1

         if rc = 0                             then rc = 1

         if condition() = '' then TRAP = TRAP arg( 1 )

   end                                 /* direct: TRAP( message ) */

 

   TRAP = TRAP || x2c( 0D0A ) || format( sigl, 6 )

   signal on syntax name TRAP.SIGL     /* throw syntax error 3... */

   if 0 < sigl & sigl <= sourceline()  /* if no handle for source */

      then TRAP = TRAP '*-*' strip( sourceline( sigl ))

      else TRAP = TRAP '+++ (source line unavailable)'

TRAP.SIGL:                             /* ...catch syntax error 3 */

   if abbrev( right( TRAP, 2 + 6 ), x2c( 0D0A )) then do

      TRAP = TRAP '+++ (source line unreadable)'   ;  rc = -rc

   end

   select

      when 0 then do                   /* in pipes STDERR: output */

         parse version TRAP.REXX       /* REXX/Personal: \dev\con */

         if abbrev( TRAP.REXX, 'REXXSAA ' ) |                /**/ ,

            6 <= word( TRAP.REXX, 2 )  then  TRAP.REXX = 'STDERR'

                                       else  TRAP.REXX = '\dev\con'

         signal on syntax name TRAP.FAIL

         call lineout TRAP.REXX , TRAP /* fails if no more handle */

      end

      when 0 then do                   /* OS/2 PM or ooREXX on NT */

         signal on syntax name TRAP.FAIL

         call RxMessageBox translate( TRAP, ' ', x2c( 0D )), /**/ ,

            'Trap' time(),, 'ERROR'

      end

      otherwise   say TRAP ; trace ?L  /* interactive Label trace */

   end

 

   if condition() = 'SIGNAL' then signal TRAP.EXIT

TRAP.CALL:  return rc                  /* continue after CALL ON  */

TRAP.FAIL:  say TRAP ;  rc = 0 - rc    /* force TRAP error output */

TRAP.EXIT:  exit   rc                  /* exit for any SIGNAL ON  */