blob: 99887352c91605515948a14fe30542c3b2a520d4 [file] [log] [blame]
/****************************************************************************
**
** Copyright (C) 2015 The Qt Company Ltd.
** Copyright (C) 2013 Intel Corporation
** Contact: http://www.qt.io/licensing/
**
** This file is part of the QtCore module of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL21$
** Commercial License Usage
** Licensees holding valid commercial Qt licenses may use this file in
** accordance with the commercial license agreement provided with the
** Software or, alternatively, in accordance with the terms contained in
** a written agreement between you and The Qt Company. For licensing terms
** and conditions see http://www.qt.io/terms-conditions. For further
** information use the contact form at http://www.qt.io/contact-us.
**
** GNU Lesser General Public License Usage
** Alternatively, this file may be used under the terms of the GNU Lesser
** General Public License version 2.1 or version 3 as published by the Free
** Software Foundation and appearing in the file LICENSE.LGPLv21 and
** LICENSE.LGPLv3 included in the packaging of this file. Please review the
** following information to ensure the GNU Lesser General Public License
** requirements will be met: https://www.gnu.org/licenses/lgpl.html and
** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
**
** As a special exception, The Qt Company gives you certain additional
** rights. These rights are described in The Qt Company LGPL Exception
** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
**
** $QT_END_LICENSE$
**
****************************************************************************/
#ifndef QUTFCODEC_P_H
#define QUTFCODEC_P_H
//
// W A R N I N G
// -------------
//
// This file is not part of the Qt API. It exists purely as an
// implementation detail. This header file may change from version to
// version without notice, or even be removed.
//
// We mean it.
//
#include "QtCore/qtextcodec.h"
#include "private/qtextcodec_p.h"
QT_BEGIN_NAMESPACE
struct QUtf8BaseTraits
{
static const bool isTrusted = false;
static const bool allowNonCharacters = true;
static const bool skipAsciiHandling = false;
static const int Error = -1;
static const int EndOfString = -2;
static bool isValidCharacter(uint u)
{ return int(u) >= 0; }
static void appendByte(uchar *&ptr, uchar b)
{ *ptr++ = b; }
static uchar peekByte(const uchar *ptr, int n = 0)
{ return ptr[n]; }
static qptrdiff availableBytes(const uchar *ptr, const uchar *end)
{ return end - ptr; }
static void advanceByte(const uchar *&ptr, int n = 1)
{ ptr += n; }
static void appendUtf16(ushort *&ptr, ushort uc)
{ *ptr++ = uc; }
static void appendUcs4(ushort *&ptr, uint uc)
{
appendUtf16(ptr, QChar::highSurrogate(uc));
appendUtf16(ptr, QChar::lowSurrogate(uc));
}
static ushort peekUtf16(const ushort *ptr, int n = 0)
{ return ptr[n]; }
static qptrdiff availableUtf16(const ushort *ptr, const ushort *end)
{ return end - ptr; }
static void advanceUtf16(const ushort *&ptr, int n = 1)
{ ptr += n; }
// it's possible to output to UCS-4 too
static void appendUtf16(uint *&ptr, ushort uc)
{ *ptr++ = uc; }
static void appendUcs4(uint *&ptr, uint uc)
{ *ptr++ = uc; }
};
struct QUtf8BaseTraitsNoAscii : public QUtf8BaseTraits
{
static const bool skipAsciiHandling = true;
};
namespace QUtf8Functions
{
/// returns 0 on success; errors can only happen if \a u is a surrogate:
/// Error if \a u is a low surrogate;
/// if \a u is a high surrogate, Error if the next isn't a low one,
/// EndOfString if we run into the end of the string.
template <typename Traits, typename OutputPtr, typename InputPtr> inline
int toUtf8(ushort u, OutputPtr &dst, InputPtr &src, InputPtr end)
{
if (!Traits::skipAsciiHandling && u < 0x80) {
// U+0000 to U+007F (US-ASCII) - one byte
Traits::appendByte(dst, uchar(u));
return 0;
} else if (u < 0x0800) {
// U+0080 to U+07FF - two bytes
// first of two bytes
Traits::appendByte(dst, 0xc0 | uchar(u >> 6));
} else {
if (!QChar::isSurrogate(u)) {
// U+0800 to U+FFFF (except U+D800-U+DFFF) - three bytes
if (!Traits::allowNonCharacters && QChar::isNonCharacter(u))
return Traits::Error;
// first of three bytes
Traits::appendByte(dst, 0xe0 | uchar(u >> 12));
} else {
// U+10000 to U+10FFFF - four bytes
// need to get one extra codepoint
if (Traits::availableUtf16(src, end) == 0)
return Traits::EndOfString;
ushort low = Traits::peekUtf16(src);
if (!QChar::isHighSurrogate(u))
return Traits::Error;
if (!QChar::isLowSurrogate(low))
return Traits::Error;
Traits::advanceUtf16(src);
uint ucs4 = QChar::surrogateToUcs4(u, low);
if (!Traits::allowNonCharacters && QChar::isNonCharacter(ucs4))
return Traits::Error;
// first byte
Traits::appendByte(dst, 0xf0 | (uchar(ucs4 >> 18) & 0xf));
// second of four bytes
Traits::appendByte(dst, 0x80 | (uchar(ucs4 >> 12) & 0x3f));
// for the rest of the bytes
u = ushort(ucs4);
}
// second to last byte
Traits::appendByte(dst, 0x80 | (uchar(u >> 6) & 0x3f));
}
// last byte
Traits::appendByte(dst, 0x80 | (u & 0x3f));
return 0;
}
inline bool isContinuationByte(uchar b)
{
return (b & 0xc0) == 0x80;
}
/// returns the number of characters consumed (including \a b) in case of success;
/// returns negative in case of error: Traits::Error or Traits::EndOfString
template <typename Traits, typename OutputPtr, typename InputPtr> inline
int fromUtf8(uchar b, OutputPtr &dst, InputPtr &src, InputPtr end)
{
int charsNeeded;
uint min_uc;
uint uc;
if (!Traits::skipAsciiHandling && b < 0x80) {
// US-ASCII
Traits::appendUtf16(dst, b);
return 1;
}
if (!Traits::isTrusted && Q_UNLIKELY(b <= 0xC1)) {
// an UTF-8 first character must be at least 0xC0
// however, all 0xC0 and 0xC1 first bytes can only produce overlong sequences
return Traits::Error;
} else if (b < 0xe0) {
charsNeeded = 2;
min_uc = 0x80;
uc = b & 0x1f;
} else if (b < 0xf0) {
charsNeeded = 3;
min_uc = 0x800;
uc = b & 0x0f;
} else if (b < 0xf5) {
charsNeeded = 4;
min_uc = 0x10000;
uc = b & 0x07;
} else {
// the last Unicode character is U+10FFFF
// it's encoded in UTF-8 as "\xF4\x8F\xBF\xBF"
// therefore, a byte higher than 0xF4 is not the UTF-8 first byte
return Traits::Error;
}
int bytesAvailable = Traits::availableBytes(src, end);
if (Q_UNLIKELY(bytesAvailable < charsNeeded - 1)) {
// it's possible that we have an error instead of just unfinished bytes
if (bytesAvailable > 0 && !isContinuationByte(Traits::peekByte(src, 0)))
return Traits::Error;
if (bytesAvailable > 1 && !isContinuationByte(Traits::peekByte(src, 1)))
return Traits::Error;
if (bytesAvailable > 2 && !isContinuationByte(Traits::peekByte(src, 2)))
return Traits::Error;
return Traits::EndOfString;
}
// first continuation character
b = Traits::peekByte(src, 0);
if (!isContinuationByte(b))
return Traits::Error;
uc <<= 6;
uc |= b & 0x3f;
if (charsNeeded > 2) {
// second continuation character
b = Traits::peekByte(src, 1);
if (!isContinuationByte(b))
return Traits::Error;
uc <<= 6;
uc |= b & 0x3f;
if (charsNeeded > 3) {
// third continuation character
b = Traits::peekByte(src, 2);
if (!isContinuationByte(b))
return Traits::Error;
uc <<= 6;
uc |= b & 0x3f;
}
}
// we've decoded something; safety-check it
if (!Traits::isTrusted) {
if (uc < min_uc)
return Traits::Error;
if (QChar::isSurrogate(uc) || uc > QChar::LastValidCodePoint)
return Traits::Error;
if (!Traits::allowNonCharacters && QChar::isNonCharacter(uc))
return Traits::Error;
}
// write the UTF-16 sequence
if (!QChar::requiresSurrogates(uc)) {
// UTF-8 decoded and no surrogates are required
// detach if necessary
Traits::appendUtf16(dst, ushort(uc));
} else {
// UTF-8 decoded to something that requires a surrogate pair
Traits::appendUcs4(dst, uc);
}
Traits::advanceByte(src, charsNeeded - 1);
return charsNeeded;
}
}
enum DataEndianness
{
DetectEndianness,
BigEndianness,
LittleEndianness
};
struct QUtf8
{
static QString convertToUnicode(const char *, int);
static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *);
static QByteArray convertFromUnicode(const QChar *, int);
static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *);
};
struct QUtf16
{
static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness);
static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness);
};
struct QUtf32
{
static QString convertToUnicode(const char *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness);
static QByteArray convertFromUnicode(const QChar *, int, QTextCodec::ConverterState *, DataEndianness = DetectEndianness);
};
#ifndef QT_NO_TEXTCODEC
class QUtf8Codec : public QTextCodec {
public:
~QUtf8Codec();
QByteArray name() const Q_DECL_OVERRIDE;
int mibEnum() const Q_DECL_OVERRIDE;
QString convertToUnicode(const char *, int, ConverterState *) const Q_DECL_OVERRIDE;
QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const Q_DECL_OVERRIDE;
void convertToUnicode(QString *target, const char *, int, ConverterState *) const;
};
class QUtf16Codec : public QTextCodec {
protected:
public:
QUtf16Codec() { e = DetectEndianness; }
~QUtf16Codec();
QByteArray name() const Q_DECL_OVERRIDE;
QList<QByteArray> aliases() const Q_DECL_OVERRIDE;
int mibEnum() const Q_DECL_OVERRIDE;
QString convertToUnicode(const char *, int, ConverterState *) const Q_DECL_OVERRIDE;
QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const Q_DECL_OVERRIDE;
protected:
DataEndianness e;
};
class QUtf16BECodec : public QUtf16Codec {
public:
QUtf16BECodec() : QUtf16Codec() { e = BigEndianness; }
QByteArray name() const Q_DECL_OVERRIDE;
QList<QByteArray> aliases() const Q_DECL_OVERRIDE;
int mibEnum() const Q_DECL_OVERRIDE;
};
class QUtf16LECodec : public QUtf16Codec {
public:
QUtf16LECodec() : QUtf16Codec() { e = LittleEndianness; }
QByteArray name() const Q_DECL_OVERRIDE;
QList<QByteArray> aliases() const Q_DECL_OVERRIDE;
int mibEnum() const Q_DECL_OVERRIDE;
};
class QUtf32Codec : public QTextCodec {
public:
QUtf32Codec() { e = DetectEndianness; }
~QUtf32Codec();
QByteArray name() const Q_DECL_OVERRIDE;
QList<QByteArray> aliases() const Q_DECL_OVERRIDE;
int mibEnum() const Q_DECL_OVERRIDE;
QString convertToUnicode(const char *, int, ConverterState *) const Q_DECL_OVERRIDE;
QByteArray convertFromUnicode(const QChar *, int, ConverterState *) const Q_DECL_OVERRIDE;
protected:
DataEndianness e;
};
class QUtf32BECodec : public QUtf32Codec {
public:
QUtf32BECodec() : QUtf32Codec() { e = BigEndianness; }
QByteArray name() const Q_DECL_OVERRIDE;
QList<QByteArray> aliases() const Q_DECL_OVERRIDE;
int mibEnum() const Q_DECL_OVERRIDE;
};
class QUtf32LECodec : public QUtf32Codec {
public:
QUtf32LECodec() : QUtf32Codec() { e = LittleEndianness; }
QByteArray name() const Q_DECL_OVERRIDE;
QList<QByteArray> aliases() const Q_DECL_OVERRIDE;
int mibEnum() const Q_DECL_OVERRIDE;
};
#endif // QT_NO_TEXTCODEC
QT_END_NAMESPACE
#endif // QUTFCODEC_P_H