kdecore: optimize KMimeType::findByContent()

Read the data once, then test the magic rules against that buffer until one
matches or all of them have been tried. This means less disk I/O.

Signed-off-by: Ivailo Monev <xakepa10@gmail.com>
This commit is contained in:
Ivailo Monev 2024-05-18 05:02:20 +03:00
parent ddbcca439d
commit 969f3e428e
6 changed files with 31 additions and 69 deletions

View file

@ -41,12 +41,12 @@
*
*/
static bool testMatches(QIODevice* device, qint64 deviceSize, QByteArray& availableData, const QList<KMimeMagicMatch>& matches, const QString& mimeType)
static bool testMatches(const QByteArray &availableData, const QList<KMimeMagicMatch>& matches, const QString& mimeType)
{
for ( QList<KMimeMagicMatch>::const_iterator it = matches.begin(), end = matches.end() ;
it != end ; ++it ) {
const KMimeMagicMatch& match = *it;
if (match.match(device, deviceSize, availableData, mimeType)) {
if (match.match(availableData, mimeType)) {
// One of the hierarchies matched -> mimetype recognized.
return true;
}
@ -54,53 +54,29 @@ static bool testMatches(QIODevice* device, qint64 deviceSize, QByteArray& availa
return false;
}
bool KMimeMagicRule::match(QIODevice* device, qint64 deviceSize, QByteArray& availableData) const
bool KMimeMagicRule::match(const QByteArray &availableData) const
{
return testMatches(device, deviceSize, availableData, m_matches, m_mimetype);
return testMatches(availableData, m_matches, m_mimetype);
}
bool KMimeMagicMatch::match(QIODevice* device, qint64 deviceSize, QByteArray& availableData, const QString& mimeType) const
bool KMimeMagicMatch::match(const QByteArray &availableData, const QString& mimeType) const
{
// First, check that "this" matches, then we'll dive into subMatches if any.
const qint64 mDataSize = m_data.size();
if (m_rangeStart + mDataSize > deviceSize)
if (m_rangeStart + mDataSize > availableData.size())
return false; // file is too small
// Read in one block all the data we'll need
// Example: m_data="ABC", m_rangeLength=3 -> we need 3+3-1=5 bytes (ABCxx,xABCx,xxABC would match)
const int dataNeeded = qMin(mDataSize + m_rangeLength - 1, deviceSize - m_rangeStart);
QByteArray readData;
/*kDebug() << "need " << dataNeeded << " bytes of data starting at " << m_rangeStart
<< " - availableData has " << availableData.size() << " bytes,"
<< " device has " << deviceSize << " bytes.";*/
if (m_rangeStart + dataNeeded > availableData.size() && availableData.size() < deviceSize) {
// Need to read from device
if (!device->seek(m_rangeStart))
return false;
readData.resize(dataNeeded);
const int nread = device->read(readData.data(), dataNeeded);
//kDebug() << "readData (from device): reading" << dataNeeded << "bytes.";
if (nread < mDataSize)
return false; // error (or not enough data but we checked for that already)
if (m_rangeStart == 0 && readData.size() > availableData.size()) {
availableData = readData; // update cache
}
if (nread < readData.size()) {
// File big enough to contain m_data, but not big enough for the full rangeLength.
// Pad with zeros.
memset(readData.data() + nread, 0, dataNeeded - nread);
}
//kDebug() << "readData (from device) at pos " << m_rangeStart << ":" << readData;
} else {
readData = QByteArray::fromRawData(availableData.constData() + m_rangeStart,
dataNeeded);
const int dataNeeded = qMin(mDataSize + m_rangeLength - 1, availableData.size() - m_rangeStart);
QByteArray readData = QByteArray::fromRawData(
availableData.constData() + m_rangeStart,
dataNeeded
);
// Warning, readData isn't null-terminated so this kDebug
// gives valgrind warnings (when printing as char* data).
//kDebug() << "readData (from availableData) at pos " << m_rangeStart << ":" << readData;
}
// All we need to do now, is to look for m_data in readData (whose size is dataNeeded).
// Either as a simple indexOf search, or applying the mask.
@ -141,5 +117,5 @@ bool KMimeMagicMatch::match(QIODevice* device, qint64 deviceSize, QByteArray& av
return true;
// Check that one of the submatches matches too
return testMatches(device, deviceSize, availableData, m_subMatches, mimeType);
return testMatches(availableData, m_subMatches, mimeType);
}

View file

@ -31,7 +31,7 @@
*/
struct KMimeMagicMatch
{
bool match(QIODevice* device, qint64 deviceSize, QByteArray& availableData, const QString& mimeType) const;
bool match(const QByteArray &availableData, const QString& mimeType) const;
qint64 m_rangeStart;
qint64 m_rangeLength;
@ -57,7 +57,7 @@ public:
KMimeMagicRule(const QString &mimetype, int priority, const QList<KMimeMagicMatch> &matches)
: m_mimetype(mimetype), m_priority(priority), m_matches(matches) {}
bool match(QIODevice* device, qint64 deviceSize, QByteArray& availableData) const;
bool match(const QByteArray &availableData) const;
QString mimetype() const { return m_mimetype; }
int priority() const { return m_priority; }

View file

@ -223,7 +223,8 @@ KMimeType::Ptr KMimeType::findByUrl(const KUrl &url, mode_t mode,
QFile file(localfile);
if (file.open(QIODevice::ReadOnly)) {
int magicAccuracy = 0;
KMimeType::Ptr mime = KMimeTypeRepository::self()->findFromContent(&file, &magicAccuracy);
// provide enough data for most rules (there are exceptions which require twice as much tho)
KMimeType::Ptr mime = KMimeTypeRepository::self()->findFromContent(file.read(16384), &magicAccuracy);
// mime can't be 0, except in case of install problems.
// However we get magicAccuracy==0 for octet-stream, i.e. no magic match found.
// kDebug() << "findFromContent said" << (mime?mime->name():QString()) << "with accuracy" << magicAccuracy;
@ -288,9 +289,7 @@ KMimeType::Ptr KMimeType::findByName(const QString &fileName, int *accuracy)
KMimeType::Ptr KMimeType::findByContent(const QByteArray &data, int *accuracy)
{
KMimeTypeRepository::self()->checkEssentialMimeTypes();
QBuffer buffer(const_cast<QByteArray *>(&data));
buffer.open(QIODevice::ReadOnly);
return KMimeTypeRepository::self()->findFromContent(&buffer, accuracy);
return KMimeTypeRepository::self()->findFromContent(data, accuracy);
}
QString KMimeType::extractKnownExtension(const QString &fileName)

View file

@ -314,26 +314,18 @@ QStringList KMimeTypeRepository::findFromFileName(const QString &fileName, QStri
return matchingMimeTypes;
}
KMimeType::Ptr KMimeTypeRepository::findFromContent(QIODevice* device, int* accuracy)
KMimeType::Ptr KMimeTypeRepository::findFromContent(const QByteArray &data, int* accuracy)
{
Q_ASSERT(device->isOpen());
const qint64 deviceSize = device->size();
if (deviceSize == 0) {
if (data.size() == 0) {
if (accuracy) {
*accuracy = 100;
}
return findMimeTypeByName(QLatin1String("application/x-zerosize"), KMimeType::DontResolveAlias);
}
// provide enough data for most rules (there are exceptions which require twice as much tho)
const qint64 dataNeeded = qMin(deviceSize, (qint64) 16384);
QByteArray beginning(dataNeeded, '\0');
if (!device->seek(0) || device->read(beginning.data(), dataNeeded) == -1) {
return defaultMimeTypePtr(); // don't bother detecting unreadable file
}
// Apply magic rules
Q_FOREACH ( const KMimeMagicRule& rule, m_magicRules ) {
if (rule.match(device, deviceSize, beginning)) {
if (rule.match( data)) {
if (accuracy) {
*accuracy = rule.priority();
}
@ -343,7 +335,7 @@ KMimeType::Ptr KMimeTypeRepository::findFromContent(QIODevice* device, int* accu
// Do fallback code so that we never return 0
// Nothing worked, check if the file contents looks like binary or text
if (!KMimeType::isBufferBinaryData(beginning)) {
if (!KMimeType::isBufferBinaryData(data)) {
if (accuracy) {
*accuracy = 5;
}

View file

@ -97,13 +97,13 @@ private: // only for KMimeType and unittests
QStringList findFromFileName(const QString &filename, QString *matchingExtension = nullptr) const;
/**
* Find a mimetype from the content of a file or buffer
* @param device the file or buffer. Must be open.
* Find a mimetype from the content of data chunk
* @param data chunk of data
* @param accuracy returns the priority of the rule that matched
*
* This is internal API, use KMimeType::findByUrl instead.
*/
KMimeType::Ptr findFromContent(QIODevice *device, int *accuracy);
KMimeType::Ptr findFromContent(const QByteArray &data, int *accuracy);
/**
* @return true if at least one mimetype is present

View file

@ -842,20 +842,15 @@ void KMimeTypeTest::testParseMagicFile()
QFETCH(QString, testData);
// kDebug() << QTest::currentDataTag();
QFETCH(QString, expected);
QBuffer testBuffer;
testBuffer.setData(testData.toLatin1());
QVERIFY(testBuffer.open(QIODevice::ReadOnly));
const qint64 testBufferSize = testBuffer.size();
QByteArray testDataBytes = testData.toLatin1();
QString found;
QByteArray beginning;
Q_FOREACH(const KMimeMagicRule& rule, m_rules) {
if (rule.match(&testBuffer, testBufferSize, beginning)) {
if (rule.match(testDataBytes)) {
found = rule.mimetype();
break;
}
}
QCOMPARE(found, expected);
testBuffer.close();
}
void KMimeTypeTest::testHelperProtocols()