(********************************************************************)
(*                                                                  *)
(*  huffman.s7i   Support for Huffman coding.                       *)
(*  Copyright (C) 2021, 2022, 2024  Thomas Mertes                   *)
(*                                                                  *)
(*  This file is part of the Seed7 Runtime Library.                 *)
(*                                                                  *)
(*  The Seed7 Runtime Library is free software; you can             *)
(*  redistribute it and/or modify it under the terms of the GNU     *)
(*  Lesser General Public License as published by the Free Software *)
(*  Foundation; either version 2.1 of the License, or (at your      *)
(*  option) any later version.                                      *)
(*                                                                  *)
(*  The Seed7 Runtime Library is distributed in the hope that it    *)
(*  will be useful, but WITHOUT ANY WARRANTY; without even the      *)
(*  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR *)
(*  PURPOSE.  See the GNU Lesser General Public License for more    *)
(*  details.                                                        *)
(*                                                                  *)
(*  You should have received a copy of the GNU Lesser General       *)
(*  Public License along with this program; if not, write to the    *)
(*  Free Software Foundation, Inc., 51 Franklin Street,             *)
(*  Fifth Floor, Boston, MA  02110-1301, USA.                       *)
(*                                                                  *)
(********************************************************************)


include "bitdata.s7i";


const type: huffmanSymbolArray is array [0 ..] integer;
const type: msbHuffmanCodeLengthArray is array [0 ..] integer;


(**
 *  Type to describe Huffman decoding from a [[bitdata#msbInBitStream|msbInBitStream]].
 *  This Huffman encoding is used by [[jpeg|JPEG]] files.
 *)
const type: msbHuffmanDecoder is new struct
    var integer: maximumCodeLength is 0;
    var huffmanSymbolArray: symbols is huffmanSymbolArray.value;
    var msbHuffmanCodeLengthArray: codeLengths is msbHuffmanCodeLengthArray.value;
  end struct;


(**
 *  Create a Huffman decoder for reading in MSB-First order.
 *  This Huffman encoding is used by [[jpeg|JPEG]] files. It can happen that
 *  ''symbols'' contains the same value twice. In that case
 *  the same symbol is encoded in two ways. This makes absolutely no
 *  sense but it can happen. For that reason it is necessary to use
 *  decoder.codeLengths with the same index as decoder.symbols.
 *  @param maximumCodeLength Maximum Huffman code length used.
 *  @param numberOfCodesWithLength Array to map bit width (index) to the number
 *                                 of symbols encoded with this bit width.
 *  @param huffmanSymbols String with symbols ordered by the bit width.
 *)
const func msbHuffmanDecoder: createMsbHuffmanDecoder (in integer: maximumCodeLength,
    in array integer: numberOfCodesWithLength, in string: huffmanSymbols) is func
  result
    var msbHuffmanDecoder: decoder is msbHuffmanDecoder.value;
  local
    var integer: currentCode is 0;
    var integer: codeLength is 0;
    var integer: symbolIndexStart is 0;
    var integer: symbolIndexEnd is 0;
    var integer: symbolIndex is 0;
    var integer: symbol is 0;
    var integer: tableIndex is 0;
  begin
    decoder.maximumCodeLength := maximumCodeLength;
    decoder.symbols := huffmanSymbolArray[.. pred(1 << maximumCodeLength)] times 0;
    decoder.codeLengths := msbHuffmanCodeLengthArray[.. pred(1 << maximumCodeLength)] times 0;
    for codeLength range 1 to maximumCodeLength do
      symbolIndexStart := succ(symbolIndexEnd);
      symbolIndexEnd +:= numberOfCodesWithLength[codeLength];
      for symbolIndex range symbolIndexStart to symbolIndexEnd do
        symbol := ord(huffmanSymbols[symbolIndex]);
        while currentCode = tableIndex >> (maximumCodeLength - codeLength) do
          decoder.symbols[tableIndex] := symbol;
          decoder.codeLengths[tableIndex] := codeLength;
          incr(tableIndex);
        end while;
        incr(currentCode);
      end for;
      currentCode <<:= 1;
    end for;
    symbol := ord(huffmanSymbols[length(huffmanSymbols)]);
    while tableIndex <= pred(1 << maximumCodeLength) do
      decoder.symbols[tableIndex] := symbol;
      decoder.codeLengths[tableIndex] := maximumCodeLength;
      incr(tableIndex);
    end while;
  end func;


# The function createHuffmanTableMsb() is deprecated. Use createMsbHuffmanDecoder() instead.
const func msbHuffmanDecoder: createHuffmanTableMsb (in integer: maximumCodeLength,
    in array integer: numberOfCodesWithLength, in string: orderedSymbols) is
  return createMsbHuffmanDecoder(maximumCodeLength, numberOfCodesWithLength, orderedSymbols);


(**
 *  Get a Huffman symbol from a [[bitdata#msbInBitStream|msbInBitStream]] using the Huffman ''decoder''.
 *  The read direction is from MSB (most significant bit) to LSB (least significant bit).
 *  The function peeks bits from ''inBitStream''. By default ''inBitStream'' appends some
 *  '\16#ff;' bytes to allow that bits can be peeked always.
 *   aSymbol := getHuffmanSymbol(compressedStream, huffmanDecoder);
 *  @param inBitStream MSB orderd bit stream from which the bits are skipped.
 *  @param decoder Huffman decoder defining the bit sequences that encode the symbols.
 *)
const func integer: getHuffmanSymbol (inout msbInBitStream: inBitStream,
    in msbHuffmanDecoder: decoder) is func
  result
    var integer: symbol is 0;
  local
    var integer: index is 0;
  begin
    index := peekBits(inBitStream, decoder.maximumCodeLength);
    symbol := decoder.symbols[index];
    skipBits(inBitStream, decoder.codeLengths[index]);
  end func;


const type: symbolsWithCodeLengthType is array array integer;


(**
 *  Compute lists of symbols (index of codeLength) ordered by code langth.
 *  The result is a two-dimensional array where the first index is a code length.
 *   symbolsWithCodeLength := computeSymbolsWithCodeLength(codeLengths);
 *   # Now symbolsWithCodeLength[2] is an array of symbols with length 2.
 *  @param codeLengths Array to map the symbols (index) to the number of bits used
 *                     to encode this symbol. Zero means: This symbol is not used.
 *  @return an array of symbol arrays with the code length as first index.
 *)
const func symbolsWithCodeLengthType: computeSymbolsWithCodeLength (in array integer: codeLengths) is func
  result
    var symbolsWithCodeLengthType: symbolsWithCodeLength is 0 times 0 times 0;
  local
    var integer: codeLength is 0;
    var integer: maximumCodeLength is 0;
    var integer: numberOfCodes is 0;
    var array integer: numberOfCodesWithLength is 0 times 0;
    var array integer: valueIndex is 0 times 0;
    var integer: symbol is 0;
  begin
    for codeLength range codeLengths do
      if codeLength > maximumCodeLength then
        maximumCodeLength := codeLength;
      end if;
    end for;
    numberOfCodesWithLength := [1 .. maximumCodeLength] times 0;
    for codeLength range codeLengths do
      if codeLength <> 0 then
        incr(numberOfCodesWithLength[codeLength]);
      end if;
    end for;
    symbolsWithCodeLength := [1 .. maximumCodeLength] times 0 times 0;
    for numberOfCodes key codeLength range numberOfCodesWithLength do
      if numberOfCodes <> 0 then
        symbolsWithCodeLength[codeLength] := numberOfCodes times 0;
      end if;
    end for;
    valueIndex := [1 .. maximumCodeLength] times 1;
    for codeLength key symbol range codeLengths do
      if codeLength <> 0 then
        symbolsWithCodeLength[codeLength][valueIndex[codeLength]] := symbol;
        incr(valueIndex[codeLength]);
      end if;
    end for;
  end func;


(**
 *  Type to describe Huffman decoding from a [[bitdata#lsbInBitStream|lsbInBitStream]].
 *  This Huffman decoding is used by the inflate (deflate) algorithm.
 *)
const type: lsbHuffmanDecoder is new struct
    var integer: maximumCodeLength is 0;
    var huffmanSymbolArray: symbols is huffmanSymbolArray.value;
    var array integer: codeLengths is 0 times 0;
  end struct;


const func lsbHuffmanDecoder: createLsbHuffmanDecoder (in array integer: codeLengths,
    in integer: maximumCodeLength,
    in symbolsWithCodeLengthType: symbolsWithCodeLength) is func
  result
    var lsbHuffmanDecoder: decoder is lsbHuffmanDecoder.value;
  local
    var integer: codeLength is 0;
    var integer: symbol is 0;
    var integer: currentCode is 0;
    var integer: reversedCode is 0;
    var integer: highBits is 0;
  begin
    decoder.maximumCodeLength := maximumCodeLength;
    decoder.symbols := huffmanSymbolArray[.. pred(1 << maximumCodeLength)] times 0;
    decoder.codeLengths := codeLengths;
    for codeLength range 1 to maximumCodeLength do
      for symbol range symbolsWithCodeLength[codeLength] do
        reversedCode := reverseBits(codeLength, currentCode);
        for highBits range 0 to pred(1 << maximumCodeLength) step 1 << codeLength do
          decoder.symbols[highBits + reversedCode] := symbol;
        end for;
        incr(currentCode);
      end for;
      currentCode <<:= 1;
    end for;
  end func;


(**
 *  Create a Huffman decoder for reading in LSB-First order.
 *  This Huffman decoding is used by the inflate (deflate) algorithm.
 *  Non-optimal Huffman encodings, where symbols are encoded with more
 *  bits than necessary, are accepted as well. The decoder is created
 *  as follows:
 *  E.g.: The code lengths (in bits) of
 *   4 0 0 6 5 3 3 3 3 3 4 3 0 0 0 0 5 5 6
 *  describe that 0 is encoded with 4 bits, 3 with 6 bits, etc.
 *  This leads to the following encoding lengths:
 *   length 3: (5, 6, 7, 8, 9, 11)
 *   length 4: (0, 10)
 *   length 5: (4, 16, 17)
 *   length 6: (3, 18)
 *  Beginning with the lowest length the following encodings are generated:
 *   000: 5
 *   001: 6
 *   ...
 *   101: 11
 *  For the next length (4 instead of 3) the value is incremented and shifted:
 *   1100: 0
 *  The decoder should be able fetch the maximum length of bits and to
 *  use it as index. In order to allow that the data must be transformed.
 *  The bits must be flipped and all variants of higher bits must be added:
 *   000000 encodes 5
 *   000001 encodes 9
 *   000010 encodes 7
 *   000011 encodes 0
 *   000100 encodes 6
 *   ...
 *   001000 encodes 5
 *   001001 encodes 9
 *   001010 encodes 7
 *   ...
 *  @param codeLengths Array to map the symbols (index) to the number of bits used
 *                     to encode this symbol. Zero means: This symbol is not used.
 *)
const func lsbHuffmanDecoder: createLsbHuffmanDecoder (in array integer: codeLengths) is func
  result
    var lsbHuffmanDecoder: decoder is lsbHuffmanDecoder.value;
  local
    var symbolsWithCodeLengthType: symbolsWithCodeLength is 0 times 0 times 0;
  begin
    symbolsWithCodeLength := computeSymbolsWithCodeLength(codeLengths);
    decoder := createLsbHuffmanDecoder(codeLengths, maxIdx(symbolsWithCodeLength),
                                       symbolsWithCodeLength);
  end func;


# The function createHuffmanTableLsb() is deprecated. use createLsbHuffmanDecoder() instead.
const func lsbHuffmanDecoder: createHuffmanTableLsb (in array integer: codeLengths) is
  return createLsbHuffmanDecoder(codeLengths);


(**
 *  Get a Huffman symbol from a [[bitdata#lsbInBitStream|lsbInBitStream]] using the Huffman ''decoder''.
 *  The read direction is from LSB (least significant bit) to MSB (most significant bit).
 *  The function peeks bits from ''inBitStream''. By default ''inBitStream'' appends some
 *  '\16#ff;' bytes to allow that bits can be peeked always.
 *   aSymbol := getHuffmanSymbol(compressedStream, huffmanDecoder);
 *  @param inBitStream LSB orderd bit stream from which the bits are skipped.
 *  @param decoder Huffman decoder defining the bit sequences that encode the symbols.
 *)
const func integer: getHuffmanSymbol (inout lsbInBitStream: inBitStream,
    in lsbHuffmanDecoder: decoder) is func
  result
    var integer: symbol is 0;
  local
    var integer: index is 0;
  begin
    index := peekBits(inBitStream, decoder.maximumCodeLength);
    symbol := decoder.symbols[index];
    skipBits(inBitStream, decoder.codeLengths[symbol]);
  end func;


(**
 *  Type to describe the Huffman encoding of one symbol.
 *)
const type: huffmanEncoding is new struct
    var integer: huffmanCode is 0;
    var integer: codeLength is 0;
  end struct;

(**
 *  Type to describe a Huffman encoder.
 *  The Huffman encoder can be used for writing in MSB-First or LSB-First order.
 *  This Huffman encoding is used by [[jpeg|JPEG]] files and by the
 *  deflate compression algorithm.
 *)
const type: huffmanEncoder is array huffmanEncoding;


(**
 *  Write a huffman symbol to a [[bitdata#lsbOutBitStream|lsbOutBitStream]] using the huffman ''outNode''.
 *  The write direction is from LSB (least significant bit) to MSB (most significant bit).
 *   putHuffmanSymbol(compressedStream, encoder[huffmanSymbol]);
 *  @param outBitStream LSB orderd bit stream to which the bits are written.
 *  @param encoding Huffman encoding which defines the actual huffmanCode and the codeLength.
 *)
const proc: putHuffmanSymbol (inout lsbOutBitStream: outBitStream, in huffmanEncoding: encoding) is func
  begin
    putBits(outBitStream, encoding.huffmanCode, encoding.codeLength);
  end func;


(**
 *  Write a huffman symbol to a [[bitdata#msbOutBitStream|msbOutBitStream]] using the huffman ''encoding''.
 *  The write direction is from MSB (most significant bit) to LSB (least significant bit).
 *   putHuffmanSymbol(compressedStream, encoder[huffmanSymbol]);
 *  @param outBitStream MSB orderd bit stream to which the bits are written.
 *  @param encoding Huffman encoding which defines the actual huffmanCode and the codeLength.
 *)
const proc: putHuffmanSymbol (inout msbOutBitStream: outBitStream, in huffmanEncoding: encoding) is func
  begin
    putBits(outBitStream, encoding.huffmanCode, encoding.codeLength);
  end func;


const type: huffmanSymbolNode is new struct
    var integer: count is 0;
    var integer: symbol is 0;
    var integer: fictiveNode is 0;
  end struct;

const func integer: compare (in huffmanSymbolNode: node1, in huffmanSymbolNode: node2) is
  return -compare(node1.count, node2.count);

const type: huffmanSymbolNodeArray is array huffmanSymbolNode;


const type: huffmanTreeNode is new struct
    var integer: leftTreeNode is 0;
    var integer: rightTreeNode is 0;
    var integer: leftSymbol is 0;
    var integer: rightSymbol is 0;
  end struct;

const type: huffmanTreeNodeArray is array huffmanTreeNode;


const func huffmanSymbolNodeArray: getHuffmanSymbolNodes (in array integer: symbolCount) is func
  result
    var huffmanSymbolNodeArray: symbolNodes is huffmanSymbolNodeArray.value;
  local
    var integer: count is 0;
    var integer: symbol is 0;
    var integer: index is 1;
  begin
    symbolNodes := length(symbolCount) times huffmanSymbolNode.value;
    for count key symbol range symbolCount do
      symbolNodes[index].count := count;
      symbolNodes[index].symbol := symbol;
      incr(index);
    end for;
    symbolNodes := sort(symbolNodes);
  end func;


const proc: getLengthsFromTree (inout array integer: codeLengths,
    in huffmanTreeNode: treeNode, in huffmanTreeNodeArray: treeNodes,
    in integer: codeLength) is func
  begin
    if treeNode.leftTreeNode <> 0 then
      getLengthsFromTree(codeLengths, treeNodes[treeNode.leftTreeNode],
                         treeNodes, succ(codeLength));
    else
      codeLengths[treeNode.leftSymbol] := succ(codeLength);
    end if;
    if treeNode.rightTreeNode <> 0 then
      getLengthsFromTree(codeLengths, treeNodes[treeNode.rightTreeNode],
                         treeNodes, succ(codeLength));
    else
      codeLengths[treeNode.rightSymbol] := succ(codeLength);
    end if;
  end func;


const proc: getLengthsFromTree (inout array integer: codeLengths,
    in huffmanSymbolNode: symbolNode, in huffmanTreeNodeArray: treeNodes,
    in integer: codeLength) is func
  begin
    if symbolNode.fictiveNode <> 0 then
      getLengthsFromTree(codeLengths, treeNodes[symbolNode.fictiveNode],
                         treeNodes, codeLength);
    else
      codeLengths[symbolNode.symbol] := succ(codeLength);
    end if;
  end func;


(**
 *  Create an array of code lengths from the array ''symbolCount''.
 *  The indices of the ''symbolCount'' array are the symbols to be huffman encoded.
 *  The indices of the returned code lengths array are the symbols to be huffman encoded.
 *  @param symbolCount Array of occurances of the corresponding symbol (index).
 *  @return an array of code lengths of the corresponding symbol (index).
 *)
const func array integer: getHuffmanCodeLengths (in array integer: symbolCount) is func
  result
    var array integer: codeLengths is 0 times 0;
  local
    var huffmanSymbolNodeArray: symbolNodes is huffmanSymbolNodeArray.value;
    var huffmanTreeNodeArray: treeNodes is huffmanTreeNodeArray.value;
    var huffmanTreeNode: treeNode is huffmanTreeNode.value;
    var integer: index is 0;
  begin
    symbolNodes := getHuffmanSymbolNodes(symbolCount);
    index := length(symbolNodes);
    while symbolNodes[index].count = 0 and
        index > minIdx(symbolNodes) do
      decr(index);
    end while;
    while index > 1 do
      treeNode := huffmanTreeNode.value;
      if symbolNodes[pred(index)].fictiveNode <> 0 then
        treeNode.leftTreeNode :=
            symbolNodes[pred(index)].fictiveNode;
      else
        treeNode.leftSymbol := symbolNodes[pred(index)].symbol;
      end if;
      if symbolNodes[index].fictiveNode <> 0 then
        treeNode.rightTreeNode :=
            symbolNodes[index].fictiveNode;
      else
        treeNode.rightSymbol := symbolNodes[index].symbol;
      end if;
      treeNodes &:= treeNode;
      symbolNodes[pred(index)].count +:= symbolNodes[index].count;
      symbolNodes[pred(index)].fictiveNode := length(treeNodes);
      symbolNodes[index].count := 0;
      symbolNodes := sort(symbolNodes);
      decr(index);
    end while;
    codeLengths := [minIdx(symbolCount) .. maxIdx(symbolCount)] times 0;
    getLengthsFromTree(codeLengths, symbolNodes[1], treeNodes, 0);
  end func;


(**
 *  Reduce the maximum code length of an Huffman encoding to ''allowedMaximum''.
 *  The given ''codeLengths'' array is changed to fit to the ''allowedMaximum''.
 *  @param codeLengths Array to map the symbols (index) to the number of bits used
 *                     to encode this symbol. Zero means: This symbol is not used.
 *  @param allowedMaximum Target maximum code length of the Huffman encoding.
 *)
const proc: reduceMaximumHuffmanCodeLength (inout array integer: codeLengths,
    in integer: allowedMaximum) is func
  local
    var symbolsWithCodeLengthType: symbolsWithCodeLength is 0 times 0 times 0;
    var integer: maximumCodeLength is 0;
    var integer: shorterCodeLength is 0;
    var integer: longerCodeLength is 0;
    var integer: symbolsMovingUp is 0;
    var boolean: moveDone is FALSE;
    var integer: index is 0;
    var integer: symbol is 0;
  begin
    symbolsWithCodeLength := computeSymbolsWithCodeLength(codeLengths);
    maximumCodeLength := maxIdx(symbolsWithCodeLength);
    repeat
      shorterCodeLength := maximumCodeLength - 2;
      moveDone := FALSE;
      repeat
        while length(symbolsWithCodeLength[shorterCodeLength]) = 0 do
          decr(shorterCodeLength);
        end while;
        longerCodeLength := shorterCodeLength + 2;
        symbolsMovingUp := 2;
        while longerCodeLength <= maximumCodeLength and
            length(symbolsWithCodeLength[longerCodeLength]) < symbolsMovingUp do
          incr(longerCodeLength);
          symbolsMovingUp *:= 2;
        end while;
        if longerCodeLength <= maximumCodeLength then
          # Move one symbol with shorterCodeLength down (code length + 1).
          # This creates a free place at the level below.
          # Because of fictive nodes two symbols can move up to this free place.
	  # Alternatively at lower levels 4, 8, 16 etc. symbols can move up.
          symbol := symbolsWithCodeLength[shorterCodeLength][1];
          symbolsWithCodeLength[succ(shorterCodeLength)] &:= symbol;
          codeLengths[symbol] := succ(shorterCodeLength);
          # Remove the moved symbol from the shorterCodeLength level.
          symbolsWithCodeLength[shorterCodeLength] :=
              symbolsWithCodeLength[shorterCodeLength][2 ..];
          # Move symbols with a longerCodeLength up (code length - 1).
          # Depending on the difference between shorterCodeLength and
          # longerCodeLength symbolsMovingUp (2, 4, 8, etc.) symbols can move up.
          for index range 1 to symbolsMovingUp do
            symbol := symbolsWithCodeLength[longerCodeLength][index];
            symbolsWithCodeLength[pred(longerCodeLength)] &:= symbol;
            codeLengths[symbol] := pred(longerCodeLength);
          end for;
          # Remove the moved symbols from the longerCodeLength level.
          symbolsWithCodeLength[longerCodeLength] :=
              symbolsWithCodeLength[longerCodeLength][succ(symbolsMovingUp) ..];
          moveDone := TRUE;
        else
          decr(shorterCodeLength);
        end if;
      until moveDone;
      if length(symbolsWithCodeLength[maximumCodeLength]) = 0 then
        decr(maximumCodeLength);
      end if;
    until maximumCodeLength <= allowedMaximum;
  end func;


const func huffmanEncoder: createHuffmanEncoder (in array integer: codeLengths,
    in integer: maximumCodeLength,
    in symbolsWithCodeLengthType: symbolsWithCodeLength) is func
  result
    var huffmanEncoder: encoder is huffmanEncoder.value;
  local
    var integer: codeLength is 0;
    var integer: symbol is 0;
    var integer: currentCode is 0;
  begin
    encoder := [minIdx(codeLengths) .. maxIdx(codeLengths)] times huffmanEncoding.value;
    for codeLength range 1 to maximumCodeLength do
      for symbol range symbolsWithCodeLength[codeLength] do
        encoder[symbol].huffmanCode := reverseBits(codeLength, currentCode);
        encoder[symbol].codeLength := codeLength;
        incr(currentCode);
      end for;
      currentCode <<:= 1;
    end for;
  end func;


(**
 *  Create a ''huffmanEncoder'' from the given ''codeLengths'' array.
 *  The Huffman encoder can be used for writing in MSB-First or LSB-First order.
 *  This Huffman encoding is used by [[jpeg|JPEG]] files and by the
 *  deflate compression algorithm.
 *  @param codeLengths Array to map the symbols (index) to the number of bits used
 *                     to encode this symbol. Zero means: This symbol is not used.
 *)
const func huffmanEncoder: createHuffmanEncoder (in array integer: codeLengths) is func
  result
    var huffmanEncoder: encoder is huffmanEncoder.value;
  local
    var symbolsWithCodeLengthType: symbolsWithCodeLength is 0 times 0 times 0;
  begin
    symbolsWithCodeLength := computeSymbolsWithCodeLength(codeLengths);
    encoder := createHuffmanEncoder(codeLengths, maxIdx(symbolsWithCodeLength),
                                    symbolsWithCodeLength);
  end func;