2015-12-10 05:06:13 +02:00
|
|
|
/****************************************************************************
|
|
|
|
**
|
|
|
|
** Copyright (C) 2015 The Qt Company Ltd.
|
2019-06-03 14:21:30 +00:00
|
|
|
** Copyright (C) 2016-2019 Ivailo Monev
|
2015-12-10 05:06:13 +02:00
|
|
|
**
|
2019-07-02 18:13:44 +00:00
|
|
|
** This file is part of the QtCore module of the Katie Toolkit.
|
2015-12-10 05:06:13 +02:00
|
|
|
**
|
|
|
|
** $QT_BEGIN_LICENSE:LGPL$
|
|
|
|
** GNU Lesser General Public License Usage
|
|
|
|
** Alternatively, this file may be used under the terms of the GNU Lesser
|
|
|
|
** General Public License version 2.1 or version 3 as published by the Free
|
|
|
|
** Software Foundation and appearing in the file LICENSE.LGPLv21 and
|
|
|
|
** LICENSE.LGPLv3 included in the packaging of this file. Please review the
|
|
|
|
** following information to ensure the GNU Lesser General Public License
|
|
|
|
** requirements will be met: https://www.gnu.org/licenses/lgpl.html and
|
|
|
|
** http://www.gnu.org/licenses/old-licenses/lgpl-2.1.html.
|
|
|
|
**
|
|
|
|
** As a special exception, The Qt Company gives you certain additional
|
|
|
|
** rights. These rights are described in The Qt Company LGPL Exception
|
|
|
|
** version 1.1, included in the file LGPL_EXCEPTION.txt in this package.
|
|
|
|
**
|
|
|
|
** GNU General Public License Usage
|
|
|
|
** Alternatively, this file may be used under the terms of the GNU
|
|
|
|
** General Public License version 3.0 as published by the Free Software
|
|
|
|
** Foundation and appearing in the file LICENSE.GPL included in the
|
|
|
|
** packaging of this file. Please review the following information to
|
|
|
|
** ensure the GNU General Public License version 3.0 requirements will be
|
|
|
|
** met: http://www.gnu.org/copyleft/gpl.html.
|
|
|
|
**
|
|
|
|
** $QT_END_LICENSE$
|
|
|
|
**
|
|
|
|
****************************************************************************/
|
2019-07-15 20:10:49 +00:00
|
|
|
|
|
|
|
#include "qtextboundaryfinder.h"
|
2019-12-14 17:45:40 +00:00
|
|
|
|
|
|
|
#include <unicode/ubrk.h>
|
2015-12-10 05:06:13 +02:00
|
|
|
|
|
|
|
QT_BEGIN_NAMESPACE
|
|
|
|
|
|
|
|
class QTextBoundaryFinderPrivate
|
|
|
|
{
|
|
|
|
public:
|
2019-12-14 17:45:40 +00:00
|
|
|
QTextBoundaryFinderPrivate();
|
|
|
|
QTextBoundaryFinderPrivate(const QTextBoundaryFinderPrivate &other);
|
|
|
|
|
2017-08-07 17:15:37 +00:00
|
|
|
QTextBoundaryFinder::BoundaryType type;
|
2016-08-28 17:06:32 +00:00
|
|
|
int pos;
|
|
|
|
QString string;
|
2019-12-14 17:45:40 +00:00
|
|
|
UBreakIterator *breakiter;
|
|
|
|
|
|
|
|
QTextBoundaryFinderPrivate& operator=(const QTextBoundaryFinderPrivate &other);
|
2015-12-10 05:06:13 +02:00
|
|
|
};
|
|
|
|
|
2019-12-14 17:45:40 +00:00
|
|
|
// Translates a QTextBoundaryFinder boundary type into the corresponding ICU
// break iterator type. Grapheme, and any value that is not one of the known
// enumerators, maps to character breaks.
static inline UBreakIteratorType getBreakType(const QTextBoundaryFinder::BoundaryType type)
{
    if (type == QTextBoundaryFinder::Word) {
        return UBRK_WORD;
    }
    if (type == QTextBoundaryFinder::Line) {
        return UBRK_LINE;
    }
    if (type == QTextBoundaryFinder::Sentence) {
        return UBRK_SENTENCE;
    }
    // QTextBoundaryFinder::Grapheme and the fallback share the same result.
    return UBRK_CHARACTER;
}
|
|
|
|
|
|
|
|
// Creates the state of an empty finder: grapheme boundaries, position zero,
// an empty string and no break iterator.
QTextBoundaryFinderPrivate::QTextBoundaryFinderPrivate()
    : type(QTextBoundaryFinder::Grapheme),
    pos(0),
    breakiter(Q_NULLPTR)
{
}
|
|
|
|
|
|
|
|
// Copy-constructs the private state from other.
//
// The members are brought into a defined state first so that operator=() can
// safely inspect (and, if needed, release) the current break iterator.
QTextBoundaryFinderPrivate::QTextBoundaryFinderPrivate(const QTextBoundaryFinderPrivate &other)
    : type(QTextBoundaryFinder::Grapheme),
    pos(0),
    breakiter(Q_NULLPTR)
{
    QTextBoundaryFinderPrivate::operator=(other);
}

// Assigns the state of other to this object and returns a reference to it.
//
// The break iterator currently held, if any, is closed. The iterator of
// other is cloned so that both objects own an independent iterator; when
// other has none, or the clone fails (a warning is printed in that case),
// breakiter ends up as Q_NULLPTR.
QTextBoundaryFinderPrivate& QTextBoundaryFinderPrivate::operator=(const QTextBoundaryFinderPrivate &other)
{
    // Self-assignment must not close the iterator that is about to be cloned.
    if (this == &other) {
        return *this;
    }

    // Release the old iterator before the string it was opened on is replaced.
    if (breakiter) {
        ubrk_close(breakiter);
        breakiter = Q_NULLPTR;
    }

    type = other.type;
    pos = other.pos;
    string = other.string;

    // An invalid source has nothing to clone; calling ubrk_safeClone() with a
    // null iterator would only report an error and emit a bogus warning.
    if (other.breakiter) {
        UErrorCode error = U_ZERO_ERROR;
        breakiter = ubrk_safeClone(other.breakiter, Q_NULLPTR, Q_NULLPTR, &error);
        if (Q_UNLIKELY(U_FAILURE(error))) {
            qWarning("QTextBoundaryFinder: ubrk_safeClone() failed %s", u_errorName(error));
            breakiter = Q_NULLPTR;
        }
    }
    return *this;
}
|
|
|
|
|
2015-12-10 05:06:13 +02:00
|
|
|
/*!
|
|
|
|
\class QTextBoundaryFinder
|
|
|
|
|
|
|
|
\brief The QTextBoundaryFinder class provides a way of finding Unicode text boundaries in a string.
|
|
|
|
|
|
|
|
\since 4.4
|
|
|
|
\ingroup tools
|
|
|
|
\ingroup shared
|
|
|
|
\ingroup string-processing
|
|
|
|
\reentrant
|
|
|
|
|
|
|
|
QTextBoundaryFinder allows finding Unicode text boundaries in a
|
|
|
|
string, similar to the Unicode text boundary specification (see
|
|
|
|
http://www.unicode.org/reports/tr29/tr29-11.html).
|
|
|
|
|
|
|
|
QTextBoundaryFinder can operate on a QString in four possible
|
|
|
|
modes depending on the value of \a BoundaryType.
|
|
|
|
|
|
|
|
Units of Unicode characters that make up what the user thinks of
|
|
|
|
as a character or basic unit of the language are here called
|
|
|
|
Grapheme clusters. The two Unicode characters 'A' + diaeresis do
|
|
|
|
for example form one grapheme cluster as the user thinks of them
|
|
|
|
as one character, yet it is in this case represented by two
|
getting QTextBoundaryFinder to behave like I want it (theoretically)
Grapheme - all-in-one, everything that is not a letter or a number - marks,
punctuations, space, etc.
Word - stop at space, illogical but right because when one wants to word-
wrap (which this is mostly used for I assume) punctuation should be on the
same row which means that a split must happen after the punctuation, e.g.
"hey,<split> joe!"
Sentence - that's very questionable, usually a punctuation (e.g. comma) can
be used to split a sentence into few sentences but that does not mean that
the sentence boundary was found (e.g. full stop, question mark, etc.) in
the current implementation
Line - obviously just line ending, that would be \n for UNIX (x000A in
Unicode) and whatever else someone comes up into his own "standard".
In any case the whole text-boundary finder class looks wrong for any use
case to me, while I was working on KHTML I found that it's much faster and
more reliable to just test if the UChar/QChar is space (via isSpace()) or
whatever the case needs rather then use the finder. I do not want to botch
it out of the toolkit yet but I suppose a class that looks for a QChar
category(ies) would be far more usefull then a boundry type specific to the
boundry finder with assumptions that are not obvious at first glance.
Signed-off-by: Ivailo Monev <xakepa10@laimg.moc>
2016-08-29 05:04:31 +00:00
|
|
|
Unicode code points.
|
2015-12-10 05:06:13 +02:00
|
|
|
|
|
|
|
Word boundaries are there to locate the start and end of what a
|
|
|
|
language considers to be a word.
|
|
|
|
|
|
|
|
Line break boundaries give possible places where a line break
|
|
|
|
might happen and sentence boundaries will show the beginning and
|
|
|
|
end of whole sentences.
|
|
|
|
|
|
|
|
The first position in a string is always a valid boundary and
|
|
|
|
refers to the position before the first character. The last
|
|
|
|
position at the length of the string is also valid and refers
|
|
|
|
to the position after the last character.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*!
|
|
|
|
\enum QTextBoundaryFinder::BoundaryType
|
|
|
|
|
|
|
|
\value Grapheme Finds a grapheme which is the smallest boundary. It
|
|
|
|
includes letters, punctuation marks, numerals and more.
|
|
|
|
\value Word Finds a word.
|
|
|
|
\value Line Finds possible positions for breaking the text into multiple
|
|
|
|
lines.
|
|
|
|
\value Sentence Finds sentence boundaries. These include periods, question
|
|
|
|
marks etc.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*!
|
|
|
|
\enum QTextBoundaryFinder::BoundaryReason
|
|
|
|
|
|
|
|
\value NotAtBoundary The boundary finder is not at a boundary position.
|
|
|
|
\value StartWord The boundary finder is at the start of a word.
|
|
|
|
\value EndWord The boundary finder is at the end of a word.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*!
|
|
|
|
Constructs an invalid QTextBoundaryFinder object.
|
|
|
|
*/
|
|
|
|
QTextBoundaryFinder::QTextBoundaryFinder()
|
2019-12-14 17:45:40 +00:00
|
|
|
: d(new QTextBoundaryFinderPrivate())
|
2015-12-10 05:06:13 +02:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
/*!
|
|
|
|
Copies the QTextBoundaryFinder object, \a other.
|
|
|
|
*/
|
|
|
|
QTextBoundaryFinder::QTextBoundaryFinder(const QTextBoundaryFinder &other)
|
getting QTextBoundaryFinder to behave like I want it (theoretically)
Grapheme - all-in-one, everything that is not a letter or a number - marks,
punctuations, space, etc.
Word - stop at space, illogical but right because when one wants to word-
wrap (which this is mostly used for I assume) punctuation should be on the
same row which means that a split must happen after the punctuation, e.g.
"hey,<split> joe!"
Sentence - that's very questionable, usually a punctuation (e.g. comma) can
be used to split a sentence into few sentences but that does not mean that
the sentence boundary was found (e.g. full stop, question mark, etc.) in
the current implementation
Line - obviously just line ending, that would be \n for UNIX (x000A in
Unicode) and whatever else someone comes up into his own "standard".
In any case the whole text-boundary finder class looks wrong for any use
case to me, while I was working on KHTML I found that it's much faster and
more reliable to just test if the UChar/QChar is space (via isSpace()) or
whatever the case needs rather then use the finder. I do not want to botch
it out of the toolkit yet but I suppose a class that looks for a QChar
category(ies) would be far more usefull then a boundry type specific to the
boundry finder with assumptions that are not obvious at first glance.
Signed-off-by: Ivailo Monev <xakepa10@laimg.moc>
2016-08-29 05:04:31 +00:00
|
|
|
: d(other.d)
|
2015-12-10 05:06:13 +02:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
/*!
|
|
|
|
Assigns the object, \a other, to another QTextBoundaryFinder object.
|
|
|
|
*/
|
|
|
|
QTextBoundaryFinder &QTextBoundaryFinder::operator=(const QTextBoundaryFinder &other)
|
|
|
|
{
|
2019-12-14 17:45:40 +00:00
|
|
|
d = other.d;
|
2015-12-10 05:06:13 +02:00
|
|
|
return *this;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*!
|
|
|
|
Destructs the QTextBoundaryFinder object.
|
|
|
|
*/
|
|
|
|
QTextBoundaryFinder::~QTextBoundaryFinder()
|
|
|
|
{
|
2019-12-14 17:45:40 +00:00
|
|
|
if (d->breakiter) {
|
|
|
|
ubrk_close(d->breakiter);
|
|
|
|
}
|
2016-08-28 17:06:32 +00:00
|
|
|
delete d;
|
2015-12-10 05:06:13 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*!
|
|
|
|
Creates a QTextBoundaryFinder object of \a type operating on \a string.
|
|
|
|
*/
|
|
|
|
QTextBoundaryFinder::QTextBoundaryFinder(BoundaryType type, const QString &string)
|
2019-12-14 17:45:40 +00:00
|
|
|
: d(new QTextBoundaryFinderPrivate())
|
2015-12-10 05:06:13 +02:00
|
|
|
{
|
2017-08-07 17:15:37 +00:00
|
|
|
d->type = type;
|
2019-07-09 22:44:57 +00:00
|
|
|
d->string = string;
|
2019-12-14 17:45:40 +00:00
|
|
|
|
|
|
|
UErrorCode error = U_ZERO_ERROR;
|
|
|
|
d->breakiter = ubrk_open(getBreakType(type), "C",
|
|
|
|
reinterpret_cast<const UChar*>(string.unicode()), string.size(), &error);
|
|
|
|
if (Q_UNLIKELY(U_FAILURE(error))) {
|
|
|
|
qWarning("QTextBoundaryFinder::QTextBoundaryFinder: ubrk_open() failed %s", u_errorName(error));
|
|
|
|
d->breakiter = Q_NULLPTR;
|
|
|
|
}
|
2015-12-10 05:06:13 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*!
|
|
|
|
Creates a QTextBoundaryFinder object of \a type operating on \a chars
|
|
|
|
with \a length.
|
|
|
|
*/
|
2016-09-22 22:01:06 +00:00
|
|
|
QTextBoundaryFinder::QTextBoundaryFinder(BoundaryType type, const QChar *chars, const int length)
|
2019-12-14 17:45:40 +00:00
|
|
|
: d(new QTextBoundaryFinderPrivate())
|
2015-12-10 05:06:13 +02:00
|
|
|
{
|
2017-08-07 17:15:37 +00:00
|
|
|
d->type = type;
|
getting QTextBoundaryFinder to behave like I want it (theoretically)
Grapheme - all-in-one, everything that is not a letter or a number - marks,
punctuations, space, etc.
Word - stop at space, illogical but right because when one wants to word-
wrap (which this is mostly used for I assume) punctuation should be on the
same row which means that a split must happen after the punctuation, e.g.
"hey,<split> joe!"
Sentence - that's very questionable, usually a punctuation (e.g. comma) can
be used to split a sentence into few sentences but that does not mean that
the sentence boundary was found (e.g. full stop, question mark, etc.) in
the current implementation
Line - obviously just line ending, that would be \n for UNIX (x000A in
Unicode) and whatever else someone comes up into his own "standard".
In any case the whole text-boundary finder class looks wrong for any use
case to me, while I was working on KHTML I found that it's much faster and
more reliable to just test if the UChar/QChar is space (via isSpace()) or
whatever the case needs rather then use the finder. I do not want to botch
it out of the toolkit yet but I suppose a class that looks for a QChar
category(ies) would be far more usefull then a boundry type specific to the
boundry finder with assumptions that are not obvious at first glance.
Signed-off-by: Ivailo Monev <xakepa10@laimg.moc>
2016-08-29 05:04:31 +00:00
|
|
|
d->string = QString::fromRawData(chars, length);
|
2019-12-14 17:45:40 +00:00
|
|
|
|
|
|
|
UErrorCode error = U_ZERO_ERROR;
|
|
|
|
d->breakiter = ubrk_open(getBreakType(type), "C",
|
|
|
|
reinterpret_cast<const UChar*>(d->string.unicode()), d->string.size(), &error);
|
|
|
|
if (Q_UNLIKELY(U_FAILURE(error))) {
|
|
|
|
qWarning("QTextBoundaryFinder::QTextBoundaryFinder: ubrk_open() failed %s", u_errorName(error));
|
|
|
|
d->breakiter = Q_NULLPTR;
|
|
|
|
}
|
2015-12-10 05:06:13 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*!
|
|
|
|
Moves the finder to the start of the string. This is equivalent to setPosition(0).
|
|
|
|
|
|
|
|
\sa setPosition(), position()
|
|
|
|
*/
|
|
|
|
void QTextBoundaryFinder::toStart()
|
|
|
|
{
|
2016-08-28 17:06:32 +00:00
|
|
|
d->pos = 0;
|
2015-12-10 05:06:13 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*!
|
|
|
|
Moves the finder to the end of the string. This is equivalent to setPosition(string.length()).
|
|
|
|
|
|
|
|
\sa setPosition(), position()
|
|
|
|
*/
|
|
|
|
void QTextBoundaryFinder::toEnd()
|
|
|
|
{
|
2019-12-14 17:45:40 +00:00
|
|
|
d->pos = d->string.size();
|
2015-12-10 05:06:13 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*!
|
|
|
|
Returns the current position of the QTextBoundaryFinder.
|
|
|
|
|
|
|
|
The range is from 0 (the beginning of the string) to the length of
|
|
|
|
the string inclusive.
|
|
|
|
|
|
|
|
\sa setPosition()
|
|
|
|
*/
|
|
|
|
int QTextBoundaryFinder::position() const
|
|
|
|
{
|
2016-08-28 17:06:32 +00:00
|
|
|
return d->pos;
|
2015-12-10 05:06:13 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*!
|
|
|
|
Sets the current position of the QTextBoundaryFinder to \a position.
|
|
|
|
|
|
|
|
If \a position is out of bounds, it will be bound to only valid
|
|
|
|
positions. In this case, valid positions are from 0 to the length of
|
|
|
|
the string inclusive.
|
|
|
|
|
|
|
|
\sa position()
|
|
|
|
*/
|
2016-09-22 22:01:06 +00:00
|
|
|
void QTextBoundaryFinder::setPosition(const int position)
|
2015-12-10 05:06:13 +02:00
|
|
|
{
|
2019-12-14 17:45:40 +00:00
|
|
|
d->pos = qBound(0, position, d->string.size());
|
2015-12-10 05:06:13 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*! \fn QTextBoundaryFinder::BoundaryType QTextBoundaryFinder::type() const
|
|
|
|
|
|
|
|
Returns the type of the QTextBoundaryFinder.
|
|
|
|
*/
|
2017-08-07 17:15:37 +00:00
|
|
|
QTextBoundaryFinder::BoundaryType QTextBoundaryFinder::type() const
|
|
|
|
{
|
|
|
|
return d->type;
|
|
|
|
}
|
2015-12-10 05:06:13 +02:00
|
|
|
|
|
|
|
/*! \fn bool QTextBoundaryFinder::isValid() const
|
|
|
|
|
|
|
|
Returns true if the text boundary finder is valid; otherwise returns false.
|
|
|
|
A default QTextBoundaryFinder is invalid.
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*!
|
2016-09-22 22:01:06 +00:00
|
|
|
Returns the string the QTextBoundaryFinder object operates on.
|
2015-12-10 05:06:13 +02:00
|
|
|
*/
|
|
|
|
QString QTextBoundaryFinder::string() const
|
|
|
|
{
|
2019-07-09 22:44:57 +00:00
|
|
|
return d->string;
|
2015-12-10 05:06:13 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*!
|
|
|
|
Moves the QTextBoundaryFinder to the next boundary position and returns that position.
|
|
|
|
|
|
|
|
Returns -1 if there is no next boundary.
|
|
|
|
*/
|
|
|
|
int QTextBoundaryFinder::toNextBoundary()
|
|
|
|
{
|
2019-12-14 17:45:40 +00:00
|
|
|
if (!d->breakiter)
|
|
|
|
return -1;
|
|
|
|
if (d->pos != -1)
|
|
|
|
ubrk_following(d->breakiter, d->pos - 1);
|
|
|
|
d->pos = ubrk_next(d->breakiter);
|
2016-08-28 17:06:32 +00:00
|
|
|
return d->pos;
|
2015-12-10 05:06:13 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*!
|
|
|
|
Moves the QTextBoundaryFinder to the previous boundary position and returns that position.
|
|
|
|
|
|
|
|
Returns -1 if there is no previous boundary.
|
|
|
|
*/
|
|
|
|
int QTextBoundaryFinder::toPreviousBoundary()
|
|
|
|
{
|
2019-12-14 17:45:40 +00:00
|
|
|
if (!d->breakiter)
|
|
|
|
return -1;
|
|
|
|
if (d->pos != -1)
|
|
|
|
ubrk_preceding(d->breakiter, d->pos + 1);
|
|
|
|
d->pos = ubrk_previous(d->breakiter);
|
2016-08-28 17:06:32 +00:00
|
|
|
return d->pos;
|
2015-12-10 05:06:13 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*!
|
|
|
|
Returns true if the object's position() is currently at a valid text boundary.
|
|
|
|
*/
|
|
|
|
bool QTextBoundaryFinder::isAtBoundary() const
|
|
|
|
{
|
2019-12-14 17:45:40 +00:00
|
|
|
if (!d->breakiter)
|
2015-12-10 05:06:13 +02:00
|
|
|
return false;
|
2019-12-14 17:45:40 +00:00
|
|
|
return ubrk_isBoundary(d->breakiter, d->pos);
|
2015-12-10 05:06:13 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
/*!
|
|
|
|
Returns the reasons for the boundary finder to have chosen the current position as a boundary.
|
|
|
|
*/
|
|
|
|
QTextBoundaryFinder::BoundaryReasons QTextBoundaryFinder::boundaryReasons() const
|
|
|
|
{
|
2019-12-14 17:45:40 +00:00
|
|
|
if (!isAtBoundary()) {
|
2016-08-28 17:06:32 +00:00
|
|
|
return QTextBoundaryFinder::NotAtBoundary;
|
2019-12-14 17:45:40 +00:00
|
|
|
} else if (d->pos == 0) {
|
2016-08-28 17:06:32 +00:00
|
|
|
return QTextBoundaryFinder::StartWord;
|
2019-12-14 17:45:40 +00:00
|
|
|
} else if (d->pos == d->string.size()) {
|
2016-08-28 17:06:32 +00:00
|
|
|
return QTextBoundaryFinder::EndWord;
|
2015-12-10 05:06:13 +02:00
|
|
|
}
|
|
|
|
|
2019-12-14 17:45:40 +00:00
|
|
|
uint reasons;
|
|
|
|
if (ubrk_isBoundary(d->breakiter, d->pos - 1)) {
|
|
|
|
reasons |= QTextBoundaryFinder::StartWord;
|
|
|
|
}
|
|
|
|
if (ubrk_isBoundary(d->breakiter, d->pos + 1)) {
|
|
|
|
reasons |= QTextBoundaryFinder::EndWord;
|
|
|
|
}
|
|
|
|
return QTextBoundaryFinder::BoundaryReason(reasons);
|
2015-12-10 05:06:13 +02:00
|
|
|
}
|
|
|
|
|
|
|
|
QT_END_NAMESPACE
|