Seed7 Program listing


(********************************************************************)
(*                                                                  *)
(*  unicode.s7i   Functions to convert between Unicode encodings.   *)
(*  Copyright (C) 2006, 2008 - 2010, 2014  Thomas Mertes            *)
(*                                                                  *)
(*  This file is part of the Seed7 Runtime Library.                 *)
(*                                                                  *)
(*  The Seed7 Runtime Library is free software; you can             *)
(*  redistribute it and/or modify it under the terms of the GNU     *)
(*  Lesser General Public License as published by the Free Software *)
(*  Foundation; either version 2.1 of the License, or (at your      *)
(*  option) any later version.                                      *)
(*                                                                  *)
(*  The Seed7 Runtime Library is distributed in the hope that it    *)
(*  will be useful, but WITHOUT ANY WARRANTY; without even the      *)
(*  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR *)
(*  PURPOSE.  See the GNU Lesser General Public License for more    *)
(*  details.                                                        *)
(*                                                                  *)
(*  You should have received a copy of the GNU Lesser General       *)
(*  Public License along with this program; if not, write to the    *)
(*  Free Software Foundation, Inc., 51 Franklin Street,             *)
(*  Fifth Floor, Boston, MA  02110-1301, USA.                       *)
(*                                                                  *)
(********************************************************************)


(**
 *  Convert a [[string]] to an UTF-8 encoded string of bytes.
 *   toUtf8("€")          returns "â\130;¬"
 *  Surrogate pairs are converted into a CESU-8 encoded string:
 *   toUtf8("\16#d834;\16#dd1e;")  returns "\237;\160;\180;\237;\180;\158;"  (surrogate pair)
 *  This function accepts unpaired surrogate characters.
 *   toUtf8("\16#dc00;")  returns "\16#ed;\16#b0;\16#80;"  (unpaired surrogate char)
 *  Note that a Unicode string should not contain surrogate characters.
 *  If the string contains surrogate pairs use
 *   toUtf8(replaceUtf16SurrogatePairs(stringWithSurrogatePairs))
 *  to create a correct (not CESU-8 encoded) UTF-8 string.
 *  @param stri Normal (UTF-32) string to be converted to UTF-8.
 *  @return ''stri'' converted to a string of bytes with UTF-8 encoding.
 *)
const func string: toUtf8 (in string: stri)                    is action "STR_TO_UTF8";


(**
 *  Convert a [[string]] with bytes in UTF-8 encoding to UTF-32.
 *   fromUtf8("â\130;¬")                         returns "€"
 *  Surrogate pairs from a CESU-8 encoded string are kept intact:
 *   fromUtf8("\237;\160;\180;\237;\180;\158;")  returns "\16#d834;\16#dd1e;" (surrogate pair)
 *  To decode a CESU-8 encoded string use:
 *   replaceUtf16SurrogatePairs(fromUtf8(cesu8String))
 *  Overlong encodings and unpaired surrogate chare are accepted.
 *   fromUtf8("\16#c0;\16#80;")                  returns "\0;"        (overlong encoding)
 *   fromUtf8("\16#ed;\16#b0;\16#80;")           returns "\16#dc00;"  (unpaired surrogate char)
 *  @param utf8 String of bytes encoded with UTF-8.
 *  @return ''utf8'' converted to a normal (UTF-32) [[string]].
 *  @exception RANGE_ERROR If ''utf8'' contains a char beyond '\255;' or
 *                         if ''utf8'' is not encoded with UTF-8.
 *)
const func string: fromUtf8 (in string: utf8)                  is action "STR_FROM_UTF8";


(**
 *  Convert a [[string]] to an UTF-16BE encoded string of bytes.
 *  @param stri Normal (UTF-32) string to be converted to UTF-16BE.
 *  @return ''stri'' converted to a string of bytes with UTF-16BE encoding.
 *  @exception RANGE_ERROR If a character is not representable as UTF-16 or
 *                         a surrogate character is present.
 *)
const func string: toUtf16Be (in string: stri) is func
  result
    var string: utf16Be is "";
  local
    var char: ch is ' ';
    var integer: ch1 is 0;
    var integer: ch2 is 0;
  begin
    for ch range stri do
      if ch <= '\16#ffff;' then
        if ch >= '\16#d800;' and ch <= '\16#dfff;' then
          raise RANGE_ERROR;
        else
          utf16Be &:= chr((ord(ch) >> 8) mod 256);
          utf16Be &:= chr( ord(ch)       mod 256);
        end if;
      elsif ch <= '\16#10ffff;' then
        ch1 := 16#d800 + ((ord(ch) - 16#10000) >> 10);
        ch2 := 16#dc00 +  (ord(ch) - 16#10000) mod 16#400;
        utf16Be &:= chr((ch1 >> 8) mod 256);
        utf16Be &:= chr( ch1       mod 256);
        utf16Be &:= chr((ch2 >> 8) mod 256);
        utf16Be &:= chr( ch2       mod 256);
      else
        raise RANGE_ERROR;
      end if;
    end for;
  end func;


(**
 *  Convert an UTF-16BE encoded [[string]] of bytes to UTF-32.
 *  @param utf16Be String of bytes encoded with UTF-16 in
 *         big endian byte order.
 *  @return ''utf16Be'' converted to a normal (UTF-32) [[string]].
 *  @exception RANGE_ERROR If the length of ''utf16Be'' is odd or
 *                         if ''utf16Be'' contains a char beyond '\255;' or
 *                         if ''utf16Be'' contains an invalid surrogate pair.
 *)
const func string: fromUtf16Be (in string: utf16Be) is func
  result
    var string: stri is "";
  local
    var integer: index is 0;
    var char: byte1 is ' ';
    var char: byte2 is ' ';
    var char: ch1 is ' ';
    var char: ch2 is ' ';
  begin
    if odd(length(utf16Be)) then
      raise RANGE_ERROR;
    end if;
    for index range 1 to length(utf16Be) step 2 do
      byte1 := utf16Be[index];
      byte2 := utf16Be[succ(index)];
      if byte1 > '\255;' or byte2 > '\255;' then
        raise RANGE_ERROR;
      else
        ch1 := chr(ord(byte1) * 256 + ord(byte2));
        if ch1 >= '\16#d800;' and ch1 <= '\16#dfff;' then
          if ch1 <= '\16#dbff;' and index < length(utf16Be) - 2 then
            index +:= 2;
            byte1 := utf16Be[index];
            byte2 := utf16Be[succ(index)];
            if byte1 > '\255;' or byte2 > '\255;' then
              raise RANGE_ERROR;
            else
              ch2 := chr(ord(byte1) * 256 + ord(byte2));
              if ch2 >= '\16#dc00;' and ch2 <= '\16#dfff;' then
                stri &:= chr(((ord(ch1) - 16#d800) << 10) +
                              (ord(ch2) - 16#dc00) + 16#10000);
              else
                raise RANGE_ERROR;
              end if;
            end if;
          else
            raise RANGE_ERROR;
          end if;
        else
          stri &:= ch1;
        end if;
      end if;
    end for;
  end func;


(**
 *  Convert a [[string]] to an UTF-16LE encoded string of bytes.
 *  @param stri Normal (UTF-32) string to be converted to UTF-16LE.
 *  @return ''stri'' converted to a string of bytes with UTF-16LE encoding.
 *  @exception RANGE_ERROR If a character is not representable as UTF-16 or
 *                         a surrogate character is present.
 *)
const func string: toUtf16Le (in string: stri) is func
  result
    var string: utf16Le is "";
  local
    var char: ch is ' ';
    var integer: ch1 is 0;
    var integer: ch2 is 0;
  begin
    for ch range stri do
      if ch <= '\16#ffff;' then
        if ch >= '\16#d800;' and ch <= '\16#dfff;' then
          raise RANGE_ERROR;
        else
          utf16Le &:= chr( ord(ch)       mod 256);
          utf16Le &:= chr((ord(ch) >> 8) mod 256);
        end if;
      elsif ch <= '\16#10ffff;' then
        ch1 := 16#d800 + ((ord(ch) - 16#10000) >> 10);
        ch2 := 16#dc00 +  (ord(ch) - 16#10000) mod 16#400;
        utf16Le &:= chr( ch1       mod 256);
        utf16Le &:= chr((ch1 >> 8) mod 256);
        utf16Le &:= chr( ch2       mod 256);
        utf16Le &:= chr((ch2 >> 8) mod 256);
      else
        raise RANGE_ERROR;
      end if;
    end for;
  end func;


(**
 *  Convert an UTF-16LE encoded [[string]] of bytes to UTF-32.
 *  @param utf16Le String of bytes encoded with UTF-16 in
 *         little endian byte order.
 *  @return ''utf16Le'' converted to a normal (UTF-32) [[string]].
 *  @exception RANGE_ERROR If the length of ''utf16Le'' is odd or
 *                         if ''utf16Le'' contains a char beyond '\255;' or
 *                         if ''utf16Le'' contains an invalid surrogate pair.
 *)
const func string: fromUtf16Le (in string: utf16Le) is func
  result
    var string: stri is "";
  local
    var integer: index is 0;
    var char: byte1 is ' ';
    var char: byte2 is ' ';
    var char: ch1 is ' ';
    var char: ch2 is ' ';
  begin
    if odd(length(utf16Le)) then
      raise RANGE_ERROR;
    end if;
    for index range 1 to length(utf16Le) step 2 do
      byte1 := utf16Le[index];
      byte2 := utf16Le[succ(index)];
      if byte1 > '\255;' or byte2 > '\255;' then
        raise RANGE_ERROR;
      else
        ch1 := chr(ord(byte2) * 256 + ord(byte1));
        if ch1 >= '\16#d800;' and ch1 <= '\16#dfff;' then
          if ch1 <= '\16#dbff;' and index < length(utf16Le) - 2 then
            index +:= 2;
            byte1 := utf16Le[index];
            byte2 := utf16Le[succ(index)];
            if byte1 > '\255;' or byte2 > '\255;' then
              raise RANGE_ERROR;
            else
              ch2 := chr(ord(byte2) * 256 + ord(byte1));
              if ch2 >= '\16#dc00;' and ch2 <= '\16#dfff;' then
                stri &:= chr(((ord(ch1) - 16#d800) << 10) +
                              (ord(ch2) - 16#dc00) + 16#10000);
              else
                raise RANGE_ERROR;
              end if;
            end if;
          else
            raise RANGE_ERROR;
          end if;
        else
          stri &:= ch1;
        end if;
      end if;
    end for;
  end func;


(**
 *  Return [[string]] where all surrogate pairs are replaced by single chars.
 *   replaceUtf16SurrogatePairs("\16#d834;\16#dd1e;")  returns "\16#1d11e;"
 *  This function can be used to decode CESU-8 encoded strings:
 *   replaceUtf16SurrogatePairs(fromUtf8(cesu8String))
 *  In CESU-8 an Unicode code point from the Basic Multilingual Plane (BMP) is
 *  encoded in the same way as in UTF-8. An Unicode code point outside the BMP
 *  is first represented as a surrogate pair, like in UTF-16, and then each
 *  surrogate code point is encoded in UTF-8.
 *  @param stri String of UTF-16 or UTF-32 Unicode characters,
 *         which may contain surrogate pairs.
 *  @return ''stri'' with all surrogate pairs replaced by single UTF-32 chars.
 *  @exception RANGE_ERROR If an invalid surrogate pair is present.
 *)
const func string: replaceUtf16SurrogatePairs (in string: stri) is func
  result
    var string: resultStri is "";
  local
    var integer: index is 0;
    var integer: startPos is 1;
    var char: ch1 is ' ';
    var char: ch2 is ' ';
  begin
    for index range 1 to length(stri) do
      ch1 := stri[index];
      if ch1 >= '\16#d800;' and ch1 <= '\16#dfff;' then
        if ch1 <= '\16#dbff;' and index < length(stri) then
          ch2 := stri[succ(index)];
          if ch2 >= '\16#dc00;' and ch2 <= '\16#dfff;' then
            resultStri &:= stri[startPos .. pred(index)];
            resultStri &:= chr(((ord(ch1) - 16#d800) << 10) +
                                (ord(ch2) - 16#dc00) + 16#10000);
            incr(index);
            startPos := succ(index);
          else
            raise RANGE_ERROR;
          end if;
        else
          raise RANGE_ERROR;
        end if;
      end if;
    end for;
    resultStri &:= stri[startPos ..];
  end func;


(**
 *  Convert a null terminated UTF-16BE encoded string of bytes to UTF-32.
 *  The UTF-16BE encoded string starts at ''startPos'' and ends with an
 *  UTF-16BE encoded null ('\0;') character. When there is no null character
 *  the UTF-16BE encoded string is assumed to extend to the end of ''stri''.
 *  @param stri UTF-16BE encoded string of bytes (starting from ''startPos'').
 *  @param startPos Start position for the UTF-16BE encoded null terminated string.
 *  @return the string found in UTF-32 encoding without the null ('\0;') character.
 *  @exception RANGE_ERROR If the conversion from UTF-16BE to UTF-32 fails.
 *)
const func string: fromNullTerminatedUtf16Be (in string: stri, in integer: startPos) is func
  result
    var string: resultStri is "";
  local
    var integer: pos is 0;
  begin
    pos := startPos;
    while pos < length(stri) and (stri[pos] <> '\0;' or stri[succ(pos)] <> '\0;') do
      pos +:= 2;
    end while;
    if pos >= length(stri) then
      resultStri := fromUtf16Be(stri[startPos ..]);
    else
      resultStri := fromUtf16Be(stri[startPos .. pred(pos)]);
    end if;
  end func;


(**
 *  Convert a null terminated UTF-16LE encoded string of bytes to UTF-32.
 *  The UTF-16LE encoded string starts at ''startPos'' and ends with an
 *  UTF-16LE encoded null ('\0;') character. When there is no null character
 *  the UTF-16LE encoded string is assumed to extend to the end of ''stri''.
 *  @param stri UTF-16LE encoded string of bytes (starting from ''startPos'').
 *  @param startPos Start position for the UTF-16LE encoded null terminated string.
 *  @return the string found in UTF-32 encoding without the null ('\0;') character.
 *  @exception RANGE_ERROR If the conversion from UTF-16LE to UTF-32 fails.
 *)
const func string: fromNullTerminatedUtf16Le (in string: stri, in integer: startPos) is func
  result
    var string: resultStri is "";
  local
    var integer: pos is 0;
  begin
    pos := startPos;
    while pos < length(stri) and (stri[pos] <> '\0;' or stri[succ(pos)] <> '\0;') do
      pos +:= 2;
    end while;
    if pos >= length(stri) then
      resultStri := fromUtf16Le(stri[startPos ..]);
    else
      resultStri := fromUtf16Le(stri[startPos .. pred(pos)]);
    end if;
  end func;


(**
 *  Read a null terminated UTF-16BE encoded string of bytes and convert it to UTF-32.
 *  The UTF-16BE encoded string starts at ''currPos'' and ends with an
 *  UTF-16BE encoded null ('\0;') character. The position ''currPos'' is
 *  advanced behind the null ('\0;') character. When there is no null character
 *  the UTF-16BE encoded string is assumed to extend to the end of ''stri''.
 *  In this case ''currPos'' is advanced beyond the length of ''stri''.
 *  @param stri UTF-16BE encoded string of bytes (starting from ''currPos'').
 *  @param currPos Start position for the UTF-16BE encoded null terminated string.
 *                 The function advances ''currPos'' to refer to the position
 *                 behind the terminating null ('\0;') character.
 *  @return the string found in UTF-32 encoding without the null ('\0;') character.
 *  @exception RANGE_ERROR If the conversion from UTF-16BE to UTF-32 fails.
 *)
const func string: getNullTerminatedUtf16Be (in string: stri, inout integer: currPos) is func
  result
    var string: resultStri is "";
  local
    var integer: pos is 0;
  begin
    pos := currPos;
    while pos < length(stri) and (stri[pos] <> '\0;' or stri[succ(pos)] <> '\0;') do
      pos +:= 2;
    end while;
    if pos >= length(stri) then
      resultStri := fromUtf16Be(stri[currPos ..]);
      currPos := succ(length(stri));
    else
      resultStri := fromUtf16Be(stri[currPos .. pred(pos)]);
      currPos := pos + 2;
    end if;
  end func;


(**
 *  Read a null terminated UTF-16BE encoded string of bytes and convert it to UTF-32.
 *  The reading ends when an UTF-16BE encoded null ('\0;') character has been read.
 *  @param inFile File with UTF-16BE encoded bytes.
 *  @return the string read in UTF-32 encoding without the null ('\0;') character.
 *  @exception RANGE_ERROR If the conversion from UTF-16BE to UTF-32 fails.
 *)
const func string: getNullTerminatedUtf16Be (inout file: inFile) is func
  result
    var string: resultStri is "";
  local
    var char: byte1 is ' ';
    var char: byte2 is ' ';
    var char: ch is ' ';
  begin
    byte1 := getc(inFile);
    byte2 := getc(inFile);
    if byte1 > '\255;' or byte2 > '\255;' then
      raise RANGE_ERROR;
    else
      ch := chr(ord(byte1) * 256 + ord(byte2));
      while ch <> '\0;' do
        resultStri &:= ch;
        byte1 := getc(inFile);
        byte2 := getc(inFile);
        if byte1 > '\255;' or byte2 > '\255;' then
          raise RANGE_ERROR;
        else
          ch := chr(ord(byte1) * 256 + ord(byte2));
        end if;
      end while;
    end if;
    resultStri := replaceUtf16SurrogatePairs(resultStri);
  end func;


(**
 *  Read a null terminated UTF-16LE encoded string of bytes and convert it to UTF-32.
 *  The UTF-16LE encoded string starts at ''currPos'' and ends with an
 *  UTF-16LE encoded null ('\0;') character. The position ''currPos'' is
 *  advanced behind the null ('\0;') character. When there is no null character
 *  the UTF-16LE encoded string is assumed to extend to the end of ''stri''.
 *  In this case ''currPos'' is advanced beyond the length of ''stri''.
 *  @param stri UTF-16LE encoded string of bytes (starting from ''currPos'').
 *  @param currPos Start position for the UTF-16LE encoded null terminated string.
 *                 The function advances ''currPos'' to refer to the position
 *                 behind the terminating null ('\0;') character.
 *  @return the string found in UTF-32 encoding without the null ('\0;') character.
 *  @exception RANGE_ERROR If the conversion from UTF-16LE to UTF-32 fails.
 *)
const func string: getNullTerminatedUtf16Le (in string: stri, inout integer: currPos) is func
  result
    var string: resultStri is "";
  local
    var integer: pos is 0;
  begin
    pos := currPos;
    while pos < length(stri) and (stri[pos] <> '\0;' or stri[succ(pos)] <> '\0;') do
      pos +:= 2;
    end while;
    if pos >= length(stri) then
      resultStri := fromUtf16Le(stri[currPos ..]);
      currPos := succ(length(stri));
    else
      resultStri := fromUtf16Le(stri[currPos .. pred(pos)]);
      currPos := pos + 2;
    end if;
  end func;


(**
 *  Read a null terminated UTF-16LE encoded string of bytes and convert it to UTF-32.
 *  The reading ends when an UTF-16LE encoded null ('\0;') character has been read.
 *  @param inFile File with UTF-16LE encoded bytes.
 *  @return the string read in UTF-32 encoding without the null ('\0;') character.
 *  @exception RANGE_ERROR If the conversion from UTF-16LE to UTF-32 fails.
 *)
const func string: getNullTerminatedUtf16Le (inout file: inFile) is func
  result
    var string: resultStri is "";
  local
    var char: byte1 is ' ';
    var char: byte2 is ' ';
    var char: ch is ' ';
  begin
    byte1 := getc(inFile);
    byte2 := getc(inFile);
    if byte1 > '\255;' or byte2 > '\255;' then
      raise RANGE_ERROR;
    else
      ch := chr(ord(byte2) * 256 + ord(byte1));
      while ch <> '\0;' do
        resultStri &:= ch;
        byte1 := getc(inFile);
        byte2 := getc(inFile);
        if byte1 > '\255;' or byte2 > '\255;' then
          raise RANGE_ERROR;
        else
          ch := chr(ord(byte2) * 256 + ord(byte1));
        end if;
      end while;
    end if;
    resultStri := replaceUtf16SurrogatePairs(resultStri);
  end func;


(**
 *  Convert a [[string]] from an UTF-7 encoding to UTF-32.
 *  @param stri7 String of bytes encoded with UTF-7.
 *  @return ''stri7'' converted a to normal (UTF-32) [[string]].
 *  @exception RANGE_ERROR The [[string]] is not UTF-7 encoded.
 *)
const func string: fromUtf7 (in string: stri7) is func
  result
    var string: resultStri is "";
  local
    const array integer: decode is [] (                      # -1 is invalid
        62, -1, -1, -1, 63,                                  # + /
        52, 53, 54, 55, 56, 57, 58, 59, 60, 61,              # 0 - 9
        -1, -1, -1, -1, -1, -1, -1,                          # =
         0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12,  # A - M
        13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,  # N - Z
        -1, -1, -1, -1, -1, -1,
        26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,  # a - m
        39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51); # n - z
    var integer: startPos is 1;
    var integer: plusPos is 0;
    var integer: minusPos is 0;
    var integer: index is 0;
    var char: ch is ' ';
    var integer: charNum is 0;
    var integer: accumulator is 0;
    var integer: numBits is 0;
    var integer: number is 0;
    var boolean: okay is FALSE;
    var string: unicodeStri is "";
  begin
    plusPos := pos(stri7, "+");
    while plusPos <> 0 do
      resultStri &:= stri7[startPos .. pred(plusPos)];
      minusPos := pos(stri7, "-", succ(plusPos));
      if minusPos = 0 then
        resultStri &:= "+";
        minusPos := plusPos;
      elsif minusPos = succ(plusPos) then
        resultStri &:= "+";
      else
        okay := TRUE;
        unicodeStri := "";
        accumulator := 0;
        numBits := 0;
        for index range succ(plusPos) to pred(minusPos) do
          ch := stri7[index];
          if ch >= '+' and ch <= 'z' then
            number := decode[ord(stri7[index]) - ord(pred('+'))];
            if number >= 0 then
              accumulator := (accumulator << 6) + number;
              numBits +:= 6;
              if numBits >= 16 then
                numBits -:= 16;
                charNum := accumulator >> numBits;
                accumulator -:= charNum << numBits;
                unicodeStri &:= chr(charNum);
              end if;
            else
              okay := FALSE;
              index := minusPos;
            end if;
          else
            okay := FALSE;
            index := minusPos;
          end if;
        end for;
        if okay then
          if accumulator <> 0 then
            raise RANGE_ERROR;
          else
            resultStri &:= replaceUtf16SurrogatePairs(unicodeStri);
          end if;
        else
          resultStri &:= "+";
          minusPos := plusPos;
        end if;
      end if;
      startPos := succ(minusPos);
      plusPos := pos(stri7, "+", startPos);
    end while;
    resultStri &:= stri7[startPos ..];
  end func;