(********************************************************************) (* *) (* utf16.s7i File implementation type for UTF-16 files *) (* Copyright (C) 2009 Thomas Mertes *) (* *) (* This file is part of the Seed7 Runtime Library. *) (* *) (* The Seed7 Runtime Library is free software; you can *) (* redistribute it and/or modify it under the terms of the GNU *) (* Lesser General Public License as published by the Free Software *) (* Foundation; either version 2.1 of the License, or (at your *) (* option) any later version. *) (* *) (* The Seed7 Runtime Library is distributed in the hope that it *) (* will be useful, but WITHOUT ANY WARRANTY; without even the *) (* implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR *) (* PURPOSE. See the GNU Lesser General Public License for more *) (* details. *) (* *) (* You should have received a copy of the GNU Lesser General *) (* Public License along with this program; if not, write to the *) (* Free Software Foundation, Inc., 51 Franklin Street, *) (* Fifth Floor, Boston, MA 02110-1301, USA. *) (* *) (********************************************************************) include "null_file.s7i"; (** * [[file|File]] implementation type for UTF-16 files. * This type supports UTF-16 encoded sequential files of the * operating system. UTF-16 files are seekable, therefore they * support the functions [[#length(in_utf16File)|length]], * [[#seek(in_utf16File,in_integer)|seek]] and * [[#tell(in_utf16File)|tell]]. *) const type: utf16File is sub null_file struct var clib_file: ext_file is CLIB_NULL_FILE; var string: name is ""; end struct; (** * Close an utf16File. * @exception FILE_ERROR A system function returns an error. *) const proc: close (in utf16File: aFile) is func begin close(aFile.ext_file); end func; (** * Forces that all buffered data of ''outFile'' is sent to its destination. * This causes data to be sent to the operating systems file system. *) const proc: flush (in utf16File: outFile) is func begin flush(outFile.ext_file); end func; (** * Determine the end-of-file indicator. * The end-of-file indicator is set if at least one request to read * from the file failed. * @return TRUE if the end-of-file indicator is set, FALSE otherwise. *) const func boolean: eof (in utf16File: inFile) is return eof(inFile.ext_file); (** * Determine if at least one character can be read successfully. * This function allows a file to be handled like an iterator. * @return FALSE if 'getc' would return [[char#EOF|EOF]], TRUE otherwise. *) const func boolean: hasNext (in utf16File: inFile) is return hasNext(inFile.ext_file); (** * Obtain the length of a file. * The file length is measured in bytes. * @return the size of the given file. * @exception RANGE_ERROR The file length does not fit into * an integer value. * @exception FILE_ERROR A system function returns an error or the * file length reported by the system is negative. *) const func integer: length (in utf16File: aFile) is return length(aFile.ext_file); (** * Truncate ''aFile'' to the given ''length''. * If the file previously was larger than ''length'', the extra data is lost. * If the file previously was shorter, it is extended, and the extended * part is filled with null bytes ('\0;'). * @param aFile File to be truncated. * @param length Requested length of ''aFile'' in bytes. * @exception RANGE_ERROR The requested length is negative or * the length is not representable in the type * used by the system function. * @exception FILE_ERROR A system function returns an error. *) const proc: truncate (in utf16File: aFile, in integer: length) is func begin truncate(aFile.ext_file, length); end func; (** * Determine if the file ''aFile'' is seekable. * If a file is seekable the functions ''seek'' and ''tell'' * can be used to set and and obtain the current file position. * @return TRUE, if ''aFile'' is seekable, FALSE otherwise. *) const func boolean: seekable (in utf16File: aFile) is return seekable(aFile.ext_file); (** * Set the current file position. * The file position is measured in bytes from the start of the file. * The first byte in the file has the position 1. * @exception RANGE_ERROR The file position is negative or zero. * @exception FILE_ERROR A system function returns an error. *) const proc: seek (in utf16File: aFile, in integer: position) is func begin seek(aFile.ext_file, position); end func; (** * Obtain the current file position. * The file position is measured in bytes from the start of the file. * The first byte in the file has the position 1. * @return the current file position. * @exception RANGE_ERROR The file position does not fit into * an integer value. * @exception FILE_ERROR A system function returns an error or the * file position reported by the system is negative. *) const func integer: tell (in utf16File: aFile) is return tell(aFile.ext_file); (** * [[file|File]] implementation type for UTF-16LE (little endian) files. * This type supports UTF-16 encoded sequential files of the * operating system. UTF-16 files are seekable, therefore they * support the functions [[#length(in_utf16File)|length]], * [[#seek(in_utf16File,in_integer)|seek]] and * [[#tell(in_utf16File)|tell]]. *) const type: utf16leFile is sub utf16File struct end struct; (** * Opens an Unicode file which uses the UTF-16LE encoding. * The file is opened with the specified ''path'' and ''mode''. * If the file is opened with one of the modes "w", "w+", "wt" or * "wt+" an appropriate BOM is created. If the file is opened * with a any other mode the application program is in charge to * handle optional BOM markers. This way 'openUtf16le' can be used * to open existing files without BOM. * There are text modes and binary modes: * *Binary modes: * ** "r" Open file for reading. * ** "w" Truncate to zero length or create file for writing. * ** "a" Append; open or create file for writing at end-of-file. * ** "r+" Open file for update (reading and writing). * ** "w+" Truncate to zero length or create file for update. * ** "a+" Append; open or create file for update, writing at end-of-file. * *Text modes: * ** "rt" Open file for reading. * ** "wt" Truncate to zero length or create file for writing. * ** "at" Append; open or create file for writing at end-of-file. * ** "rt+" Open file for update (reading and writing). * ** "wt+" Truncate to zero length or create file for update. * ** "at+" Append; open or create file for update, writing at end-of-file. * Note that this modes differ from the ones used by the C function * fopen(). * @param path Path of the file to be opened. The path must * use the standard path representation. * @param mode Mode of the file to be opened. * @return the file opened, or [[null_file#STD_NULL|STD_NULL]] * if it could not be opened or if ''path'' refers to * a directory. * @exception MEMORY_ERROR Not enough memory to convert the path * to the system path type. * @exception RANGE_ERROR The ''mode'' is not one of the allowed * values or ''path'' does not use the standard path * representation or ''path'' cannot be converted * to the system path type. *) const func file: openUtf16le (in string: path, in string: mode) is func result var file: newFile is STD_NULL; local var clib_file: open_file is CLIB_NULL_FILE; var utf16leFile: new_le_file is utf16leFile.value; begin open_file := openClibFile(path, mode); if open_file <> CLIB_NULL_FILE then if mode[1 len 1] = "w" then write(open_file, "\16#ff;\16#fe;"); end if; new_le_file.ext_file := open_file; new_le_file.name := path; newFile := toInterface(new_le_file); end if; end func; (** * Write a string to an UTF-16LE file. * @exception RANGE_ERROR A character is not representable with UTF-16. * @exception FILE_ERROR The system function returns an error. *) const proc: write (in utf16leFile: outFile, in string: stri) is func local var char: ch is ' '; var integer: number is 0; var char: ch1 is ' '; var char: ch2 is ' '; begin for ch range stri do if ch <= '\16#ffff;' then write(outFile.ext_file, str(chr(ord(ch) mod 256))); write(outFile.ext_file, str(chr(ord(ch) mdiv 256))); elsif ch <= '\16#10ffff;' then number := ord(ch) - 16#10000; ch1 := chr(16#d800 + (number >> 10)); write(outFile.ext_file, str(chr(ord(ch1) mod 256))); write(outFile.ext_file, str(chr(ord(ch1) mdiv 256))); ch2 := chr(16#dc00 + (number mod 16#400)); write(outFile.ext_file, str(chr(ord(ch2) mod 256))); write(outFile.ext_file, str(chr(ord(ch2) mdiv 256))); else raise RANGE_ERROR; end if; end for; end func; (** * Return a string read with a maximum length from an UTF-16LE file. * @return the string read. * @exception RANGE_ERROR The parameter ''maxLength'' is negative, or * the file contains an invalid surrogate pair. *) const func string: gets (in utf16leFile: inFile, in integer: maxLength) is func result var string: striRead is ""; local var integer: charsMissing is 0; var boolean: partialRead is FALSE; var string: stri is ""; var string: surrogate_part is ""; var integer: index is 0; var char: ch1 is ' '; var char: ch2 is ' '; begin if maxLength <= 0 then if maxLength <> 0 then raise RANGE_ERROR; end if; else charsMissing := maxLength; repeat stri := gets(inFile.ext_file, 2 * charsMissing); if odd(length(stri)) then raise RANGE_ERROR; elsif length(stri) <> 2 * charsMissing then partialRead := TRUE; end if; for index range 1 to length(stri) mdiv 2 do ch1 := chr(ord(stri[index * 2]) * 256 + ord(stri[pred(index * 2)])); if ch1 >= '\16#d800;' and ch1 <= '\16#dbff;' then # ch1 introduces a surrogate pair if index = length(stri) mdiv 2 then surrogate_part := gets(inFile.ext_file, 2); if length(surrogate_part) = 2 then ch2 := chr(ord(surrogate_part[2]) * 256 + ord(surrogate_part[1])); else raise RANGE_ERROR; end if; else incr(index); ch2 := chr(ord(stri[index * 2]) * 256 + ord(stri[pred(index * 2)])); end if; if ch2 >= '\16#dc00;' and ch2 <= '\16#dfff;' then # ch1 and ch2 are a surrogate pair striRead &:= chr((ord(ch1) - 16#d800) << 10 + (ord(ch2) - 16#dc00) + 16#10000); else raise RANGE_ERROR; end if; else striRead &:= str(ch1); end if; end for; charsMissing := maxLength - length(striRead); until charsMissing = 0 or partialRead; end if; end func; (** * [[file|File]] implementation type for UTF-16BE (big endian) files. * This type supports UTF-16 encoded sequential files of the * operating system. UTF-16 files are seekable, therefore they * support the functions [[#length(in_utf16File)|length]], * [[#seek(in_utf16File,in_integer)|seek]] and * [[#tell(in_utf16File)|tell]]. *) const type: utf16beFile is sub utf16File struct end struct; (** * Opens an Unicode file which uses the UTF-16BE encoding. * The file is opened with the specified ''path'' and ''mode''. * If the file is opened with one of the modes "w", "w+", "wt" or * "wt+" an appropriate BOM is created. If the file is opened * with a any other mode the application program is in charge to * handle optional BOM markers. This way 'openUtf16be' can be used * to open existing files without BOM. * There are text modes and binary modes: * *Binary modes: * ** "r" Open file for reading. * ** "w" Truncate to zero length or create file for writing. * ** "a" Append; open or create file for writing at end-of-file. * ** "r+" Open file for update (reading and writing). * ** "w+" Truncate to zero length or create file for update. * ** "a+" Append; open or create file for update, writing at end-of-file. * *Text modes: * ** "rt" Open file for reading. * ** "wt" Truncate to zero length or create file for writing. * ** "at" Append; open or create file for writing at end-of-file. * ** "rt+" Open file for update (reading and writing). * ** "wt+" Truncate to zero length or create file for update. * ** "at+" Append; open or create file for update, writing at end-of-file. * Note that this modes differ from the ones used by the C function * fopen(). * @param path Path of the file to be opened. The path must * use the standard path representation. * @param mode Mode of the file to be opened. * @return the file opened, or [[null_file#STD_NULL|STD_NULL]] * if it could not be opened or if ''path'' refers to * a directory. * @exception MEMORY_ERROR Not enough memory to convert the path * to the system path type. * @exception RANGE_ERROR The ''mode'' is not one of the allowed * values or ''path'' does not use the standard path * representation or ''path'' cannot be converted * to the system path type. *) const func file: openUtf16be (in string: path, in string: mode) is func result var file: newFile is STD_NULL; local var clib_file: open_file is CLIB_NULL_FILE; var utf16beFile: new_be_file is utf16beFile.value; begin open_file := openClibFile(path, mode); if open_file <> CLIB_NULL_FILE then if mode[1 len 1] = "w" then write(open_file, "\16#fe;\16#ff;"); end if; new_be_file.ext_file := open_file; new_be_file.name := path; newFile := toInterface(new_be_file); end if; end func; (** * Write a string to an UTF-16BE file. * @exception RANGE_ERROR If a character is not representable with UTF-16. * @exception FILE_ERROR The system function returns an error. *) const proc: write (in utf16beFile: outFile, in string: stri) is func local var char: ch is ' '; var integer: number is 0; var char: ch1 is ' '; var char: ch2 is ' '; begin for ch range stri do if ch <= '\16#ffff;' then write(outFile.ext_file, str(chr(ord(ch) mdiv 256))); write(outFile.ext_file, str(chr(ord(ch) mod 256))); elsif ch <= '\16#10ffff;' then number := ord(ch) - 16#10000; ch1 := chr(16#d800 + (number >> 10)); write(outFile.ext_file, str(chr(ord(ch1) mdiv 256))); write(outFile.ext_file, str(chr(ord(ch1) mod 256))); ch2 := chr(16#dc00 + (number mod 16#400)); write(outFile.ext_file, str(chr(ord(ch2) mdiv 256))); write(outFile.ext_file, str(chr(ord(ch2) mod 256))); else raise RANGE_ERROR; end if; end for; end func; (** * Return a string read with a maximum length from an UTF-16BE file. * @return the string read. * @exception RANGE_ERROR The parameter ''maxLength'' is negative, or * the file contains an invalid surrogate pair. *) const func string: gets (in utf16beFile: inFile, in integer: maxLength) is func result var string: striRead is ""; local var integer: charsMissing is 0; var boolean: partialRead is FALSE; var string: stri is ""; var string: surrogate_part is ""; var integer: index is 0; var char: ch1 is ' '; var char: ch2 is ' '; begin if maxLength <= 0 then if maxLength <> 0 then raise RANGE_ERROR; end if; else charsMissing := maxLength; repeat stri := gets(inFile.ext_file, 2 * charsMissing); if odd(length(stri)) then raise RANGE_ERROR; elsif length(stri) <> 2 * charsMissing then partialRead := TRUE; end if; for index range 1 to length(stri) mdiv 2 do ch1 := chr(ord(stri[pred(index * 2)]) * 256 + ord(stri[index * 2])); if ch1 >= '\16#d800;' and ch1 <= '\16#dbff;' then # ch1 introduces a surrogate pair if index = length(stri) mdiv 2 then surrogate_part := gets(inFile.ext_file, 2); if length(surrogate_part) = 2 then ch2 := chr(ord(surrogate_part[1]) * 256 + ord(surrogate_part[2])); else raise RANGE_ERROR; end if; else incr(index); ch2 := chr(ord(stri[pred(index * 2)]) * 256 + ord(stri[index * 2])); end if; if ch2 >= '\16#dc00;' and ch2 <= '\16#dfff;' then # ch1 and ch2 are a surrogate pair striRead &:= chr((ord(ch1) - 16#d800) << 10 + (ord(ch2) - 16#dc00) + 16#10000); else raise RANGE_ERROR; end if; else striRead &:= str(ch1); end if; end for; charsMissing := maxLength - length(striRead); until charsMissing = 0 or partialRead; end if; end func; (** * Opens an Unicode file which uses the UTF-16LE or UTF-16BE encoding. * The file is opened with the specified ''path'' and ''mode''. * The function 'openUtf16' checks for a BOM and depending on that * opens an UTF-16LE or UTF-16BE file. If the file contains no BOM * the function returns [[null_file#STD_NULL|STD_NULL]]. * There are text modes and binary modes: * *Binary modes: * ** "r" Open file for reading. * ** "w" Truncate to zero length or create file for writing. * ** "a" Append; open or create file for writing at end-of-file. * ** "r+" Open file for update (reading and writing). * ** "w+" Truncate to zero length or create file for update. * ** "a+" Append; open or create file for update, writing at end-of-file. * *Text modes: * ** "rt" Open file for reading. * ** "wt" Truncate to zero length or create file for writing. * ** "at" Append; open or create file for writing at end-of-file. * ** "rt+" Open file for update (reading and writing). * ** "wt+" Truncate to zero length or create file for update. * ** "at+" Append; open or create file for update, writing at end-of-file. * Note that this modes differ from the ones used by the C function * fopen(). * @param path Path of the file to be opened. The path must * use the standard path representation. * @param mode Mode of the file to be opened. * @return the file opened, or [[null_file#STD_NULL|STD_NULL]] * if it could not be opened or if ''path'' refers to * a directory. * @exception MEMORY_ERROR Not enough memory to convert the path * to the system path type. * @exception RANGE_ERROR The ''mode'' is not one of the allowed * values or ''path'' does not use the standard path * representation or ''path'' cannot be converted * to the system path type. *) const func file: openUtf16 (in string: path, in string: mode) is func result var file: newFile is STD_NULL; local var clib_file: open_file is CLIB_NULL_FILE; var utf16leFile: new_le_file is utf16leFile.value; var utf16beFile: new_be_file is utf16beFile.value; var string: bom is ""; begin open_file := openClibFile(path, mode); if open_file <> CLIB_NULL_FILE then bom := gets(open_file, 2); if bom = "\16#ff;\16#fe;" then new_le_file.ext_file := open_file; new_le_file.name := path; newFile := toInterface(new_le_file); elsif bom = "\16#fe;\16#ff;" then new_be_file.ext_file := open_file; new_be_file.name := path; newFile := toInterface(new_be_file); else close(open_file); end if; end if; end func;