Revision 14d665b3e84ac3164977b801b0003efe5d754d1a authored by Chris Lattner on 17 January 2016, 20:40:43 UTC, committed by Doug Gregor on 25 January 2016, 18:55:45 UTC
1 parent f2cd3f0
Compression.cpp
//===--- Compression.cpp - Compression of symbols -------------------------===//
//
// This source file is part of the Swift.org open source project
//
// Copyright (c) 2014 - 2016 Apple Inc. and the Swift project authors
// Licensed under Apache License v2.0 with Runtime Library Exception
//
// See http://swift.org/LICENSE.txt for license information
// See http://swift.org/CONTRIBUTORS.txt for the list of Swift project authors
//
//===----------------------------------------------------------------------===//
#include "swift/ABI/Compression.h"
#include "llvm/Support/ErrorHandling.h"
#include "CBCTables.h"
#include "HuffTables.h"
#include <assert.h>
using namespace llvm;
using namespace swift;
using EncodingKind = swift::Compress::EncodingKind;
static unsigned CBCindexOfChar(char c) {
int idx = CBC::IndexOfChar[int(c)];
assert(idx >= 0 && "Invalid char");
return (unsigned) idx;
}
std::string swift::Compress::DecodeCBCString(StringRef In) {
unsigned EndIndex = In.size();
// The String Builder.
std::string SB;
SB.reserve(In.size());
// Processed Index - The index of the first non-processed char.
unsigned PI = 0;
while (true) {
if (PI >= EndIndex) break;
const char c = In[PI];
if (c == CBC::EscapeChar0) {
if ((PI+1) >= EndIndex) {
llvm_unreachable("Invalid Encoding.");
return "";
}
const char N = In[PI+1];
if (N == CBC::EscapeChar0 || N == CBC::EscapeChar1) {
SB += N;
PI += 2;
continue;
}
unsigned Idx = CBCindexOfChar(N);
if (Idx > CBC::CharsetLength || CBC::Charset[Idx] != N) {
llvm_unreachable("Decoded invalid character.");
return "";
}
SB += CBC::CodeBook[Idx];
PI += 2;
continue;
}
if (c == CBC::EscapeChar1) {
if ((PI+2) >= EndIndex) {
llvm_unreachable("Decoded invalid index.");
return "";
}
const char N0 = In[PI+1];
const char N1 = In[PI+2];
unsigned JointIndex = (CBC::CharsetLength * CBCindexOfChar(N0)) +
CBCindexOfChar(N1);
if (JointIndex > CBC::NumFragments) {
llvm_unreachable("Decoded invalid index.");
return "";
}
SB += CBC::CodeBook[JointIndex];
PI += 3;
continue;
}
// We did not find a match. Just decode the character.
SB += c;
PI++;
}
return SB;
}
std::string swift::Compress::EncodeCBCString(StringRef In) {
unsigned EndIndex = In.size();
// The String Builder.
std::string SB;
SB.reserve(In.size());
// Processed Index - The index of the first non-processed char.
unsigned PI = 0;
while (true) {
StartMatch:
if (PI >= EndIndex) break;
const char c = In[PI];
if (c == CBC::EscapeChar0 || c == CBC::EscapeChar1) {
SB += CBC::EscapeChar0;
SB += c;
PI++;
goto StartMatch;
}
// The number of unprocessed chars.
unsigned DistanceToEnd = EndIndex - PI;
int FragIdx = CBC::matchStringSuffix(In.data() + PI, DistanceToEnd);
if (FragIdx >= 0){
unsigned FragmentLength = CBC::CodeBookLen[FragIdx];
// Try encoding using a single escape character.
if (FragIdx < int(CBC::CharsetLength)) {
SB += CBC::EscapeChar0;
SB += CBC::Charset[FragIdx];
PI += FragmentLength;
goto StartMatch;
}
// Try encoding using two escape character. Make sure that the encoding
// is profitable.
if (FragmentLength > 3) {
SB += CBC::EscapeChar1;
SB += CBC::Charset[FragIdx / CBC::CharsetLength];
SB += CBC::Charset[FragIdx % CBC::CharsetLength];
PI += FragmentLength;
goto StartMatch;
}
}
// We did not find a match. Just save the character. :(
SB += c;
PI++;
}
return SB;
}
/// This function computes the number of characters that we can encode in a
/// 64bit number without overflowing.
///
/// \returns a pair of:
/// - The number of characters that fit in a 64bit word.
/// - The value of CharsetLength^NumLetters (CL to the power of NL).
static std::pair<uint64_t, uint64_t> get64bitEncodingParams() {
uint64_t CL = Huffman::CharsetLength;
// We encode each letter using Log2(CL) bits, and the number of letters we can
// encode is a 64bit number is 64/Log2(CL).
//
// Doing this computation in floating-point arithmetic could give a slightly
// better (more optimistic) result, but the computation may not be constant
// at compile time.
uint64_t NumLetters = 64 / Log2_64_Ceil(CL);
// Calculate CharsetLength^NumLetters (CL to the power of NL), which is the
// highest numeric value that can hold NumLetters characters in a 64bit
// number.
//
// Notice: this loop is optimized away and CLX is computed to a
// constant integer at compile time.
uint64_t CLX = 1;
for (unsigned i = 0; i < NumLetters; i++) { CLX *= CL; }
return std::make_pair(NumLetters, CLX);
}
/// Extract all of the characters from the number \p Num one by one and
/// insert them into the string builder \p SB.
static void DecodeFixedWidth(APInt &Num, std::string &SB) {
uint64_t CL = Huffman::CharsetLength;
assert(Num.getBitWidth() > 8 &&
"Not enough bits for arithmetic on this alphabet");
// Try to decode eight numbers at once. It is much faster to work with
// local 64bit numbers than working with APInt. In this loop we try to
// extract NL characters at once and process them using a local 64-bit
// number.
uint64_t CLX;
uint64_t NumLetters;
std::tie(NumLetters, CLX) = get64bitEncodingParams();
while (Num.ugt(CLX)) {
unsigned BW = Num.getBitWidth();
APInt C = APInt(BW, CLX);
APInt Quotient(1, 0), Remainder(1, 0);
APInt::udivrem(Num, C, Quotient, Remainder);
// Try to reduce the bitwidth of the API after the division. This can
// accelerate the division operation in future iterations because the
// number becomes smaller (fewer bits) with each iteration. However,
// We can't reduce the number to something too small because we still
// need to be able to perform the "mod charset_length" operation.
Num = Quotient.zextOrTrunc(std::max(Quotient.getActiveBits(), 64u));
uint64_t Tail = Remainder.getZExtValue();
for (unsigned i = 0; i < NumLetters; i++) {
SB += Huffman::Charset[Tail % CL];
Tail = Tail / CL;
}
}
// Pop characters out of the APInt one by one.
while (Num.getBoolValue()) {
unsigned BW = Num.getBitWidth();
APInt C = APInt(BW, CL);
APInt Quotient(1, 0), Remainder(1, 0);
APInt::udivrem(Num, C, Quotient, Remainder);
Num = Quotient;
SB += Huffman::Charset[Remainder.getZExtValue()];
}
}
static unsigned HuffIndexOfChar(char c) {
int idx = Huffman::IndexOfChar[int(c)];
assert(idx >= 0 && "Invalid char");
return (unsigned) idx;
}
APInt
swift::Compress::EncodeStringAsNumber(StringRef In, EncodingKind Kind) {
// Allocate enough space for the first character plus one bit which is the
// stop bit for variable length encoding.
unsigned BW = (1 + Huffman::LongestEncodingLength);
APInt num = APInt(BW, 0);
// We set the high bit to zero in order to support encoding
// of chars that start with zero (for variable length encoding).
if (Kind == EncodingKind::Variable) {
num = ++num;
}
// Encode variable-length strings.
if (Kind == EncodingKind::Variable) {
uint64_t num_bits = 0;
uint64_t bits = 0;
// Append the characters in the string in reverse. This will allow
// us to decode by appending to a string and not prepending.
for (int i = In.size() - 1; i >= 0; i--) {
char ch = In[i];
// The local variables 'bits' and 'num_bits' are used as a small
// bitstream. Keep accumulating bits into them until they overflow.
// At that point move them into the APInt.
uint64_t local_bits;
uint64_t local_num_bits;
// Find the huffman encoding of the character.
Huffman::variable_encode(local_bits, local_num_bits, ch);
// Add the encoded character into our bitstream.
num_bits += local_num_bits;
bits = (bits << local_num_bits) + local_bits;
// Check if there is enough room for another word. If not, flush
// the local bitstream into the APInt.
if (num_bits >= (64 - Huffman::LongestEncodingLength)) {
// Make room for the new bits and add the bits.
num = num.zext(num.getBitWidth() + num_bits);
num = num.shl(num_bits); num = num + bits;
num_bits = 0; bits = 0;
}
}
// Flush the local bitstream into the APInt number.
if (num_bits) {
num = num.zext(num.getBitWidth() + num_bits);
num = num.shl(num_bits); num = num + bits;
num_bits = 0; bits = 0;
}
// Make sure that we have a minimal word size to be able to perform
// calculations on our alphabet.
return num.zextOrSelf(std::max(64u, num.getBitWidth()));
}
// Encode fixed width strings.
// Try to decode a few numbers at once. First encode characters into a local
// variable and then encode that variable. It is much faster to work with
// local 64-bit numbers than APInts.
uint64_t CL = Huffman::CharsetLength;
uint64_t CLX;
uint64_t maxNumLetters;
std::tie(maxNumLetters, CLX) = get64bitEncodingParams();
uint64_t numEncodedChars = 0;
uint64_t encodedCharsValue = 0;
for (int i = In.size() - 1; i >= 0; i--) {
// Encode a single char into the local temporary variable.
unsigned charIdx = HuffIndexOfChar(In[i]);
encodedCharsValue = encodedCharsValue * CL + charIdx;
numEncodedChars++;
// If we've reached the maximum capacity then push the value into the APInt.
if (numEncodedChars == maxNumLetters) {
num = num.zextOrSelf(num.getActiveBits() + 64);
num *= APInt(num.getBitWidth(), CLX);
num += APInt(num.getBitWidth(), encodedCharsValue);
numEncodedChars = 0;
encodedCharsValue = 0;
}
}
// Encode the last few characters.
if (numEncodedChars) {
// Compute the value that we need to multiply the APInt to make room for the
// last few characters that did not make a complete 64-bit value.
uint64_t tailCLX = 1;
for (unsigned i = 0; i < numEncodedChars; i++) { tailCLX *= CL; }
num = num.zextOrSelf(num.getActiveBits() + 64);
num *= APInt(num.getBitWidth(), tailCLX);
num += APInt(num.getBitWidth(), encodedCharsValue);
}
return num;
}
std::string swift::Compress::DecodeStringFromNumber(const APInt &In,
EncodingKind Kind) {
APInt num = In;
std::string sb;
// This is the max number of bits that we can hold in a 64bit number without
// overflowing in the next round of character decoding.
unsigned MaxBitsPerWord = 64 - Huffman::LongestEncodingLength;
if (Kind == EncodingKind::Variable) {
// Keep decoding until we reach our sentinel value.
// See the encoder implementation for more details.
while (num.ugt(1)) {
// Try to decode a bunch of characters together without modifying the
// big number.
if (num.getActiveBits() > 64) {
// Collect the bottom 64-bit.
uint64_t tailbits = *num.getRawData();
// This variable is used to record the number of bits that were
// extracted from the lowest 64 bit of the big number.
unsigned bits = 0;
// Keep extracting bits from the tail of the APInt until you reach
// the end of the word (64 bits minus the size of the largest
// character possible).
while (bits < MaxBitsPerWord) {
char ch;
unsigned local_bits;
std::tie(ch, local_bits) = Huffman::variable_decode(tailbits);
sb += ch;
tailbits >>= local_bits;
bits += local_bits;
}
// Now that we've extracted a few characters from the tail of the APInt
// we need to shift the APInt and prepare for the next round. We shift
// the APInt by the number of bits that we extracted in the loop above.
num = num.lshr(bits);
bits = 0;
} else {
// We don't have enough bits in the big num in order to extract a few
// numbers at once, so just extract a single character.
uint64_t tailbits = *num.getRawData();
char ch;
unsigned bits;
std::tie(ch, bits) = Huffman::variable_decode(tailbits);
sb += ch;
num = num.lshr(bits);
}
}
} else {
// Decode this number as a regular fixed-width sequence of characters.
DecodeFixedWidth(num, sb);
}
return sb;
}
std::string swift::Compress::CompressName(StringRef In) {
std::string cbc = EncodeCBCString(In);
APInt num = EncodeStringAsNumber(cbc, EncodingKind::Variable);
return DecodeStringFromNumber(num, EncodingKind::Fixed);
}
std::string swift::Compress::DecompressName(StringRef In) {
APInt num = EncodeStringAsNumber(In, EncodingKind::Fixed);
std::string str = DecodeStringFromNumber(num, EncodingKind::Variable);
return DecodeCBCString(str);
}
Computing file changes ...