From 969f3e428e36ac6c4f4dcc47acd2ad18a91f841d Mon Sep 17 00:00:00 2001 From: Ivailo Monev Date: Sat, 18 May 2024 05:02:20 +0300 Subject: [PATCH] kdecore: optimize KMimeType::findByContent() read once, match until match is found or otherwise. that means less disk I/O Signed-off-by: Ivailo Monev --- kdecore/services/kmimemagicrule.cpp | 54 +++++++----------------- kdecore/services/kmimemagicrule_p.h | 6 +-- kdecore/services/kmimetype.cpp | 7 ++- kdecore/services/kmimetyperepository.cpp | 16 ++----- kdecore/services/kmimetyperepository_p.h | 6 +-- kdecore/tests/kmimetypetest.cpp | 11 ++--- 6 files changed, 31 insertions(+), 69 deletions(-) diff --git a/kdecore/services/kmimemagicrule.cpp b/kdecore/services/kmimemagicrule.cpp index d559286b..d726b308 100644 --- a/kdecore/services/kmimemagicrule.cpp +++ b/kdecore/services/kmimemagicrule.cpp @@ -41,12 +41,12 @@ * */ -static bool testMatches(QIODevice* device, qint64 deviceSize, QByteArray& availableData, const QList& matches, const QString& mimeType) +static bool testMatches(const QByteArray &availableData, const QList& matches, const QString& mimeType) { for ( QList::const_iterator it = matches.begin(), end = matches.end() ; it != end ; ++it ) { const KMimeMagicMatch& match = *it; - if (match.match(device, deviceSize, availableData, mimeType)) { + if (match.match(availableData, mimeType)) { // One of the hierarchies matched -> mimetype recognized. return true; } @@ -54,53 +54,29 @@ static bool testMatches(QIODevice* device, qint64 deviceSize, QByteArray& availa return false; } -bool KMimeMagicRule::match(QIODevice* device, qint64 deviceSize, QByteArray& availableData) const +bool KMimeMagicRule::match(const QByteArray &availableData) const { - return testMatches(device, deviceSize, availableData, m_matches, m_mimetype); + return testMatches(availableData, m_matches, m_mimetype); } -bool KMimeMagicMatch::match(QIODevice* device, qint64 deviceSize, QByteArray& availableData, const QString& mimeType) const +bool KMimeMagicMatch::match(const QByteArray &availableData, const QString& mimeType) const { // First, check that "this" matches, then we'll dive into subMatches if any. const qint64 mDataSize = m_data.size(); - if (m_rangeStart + mDataSize > deviceSize) + if (m_rangeStart + mDataSize > availableData.size()) return false; // file is too small // Read in one block all the data we'll need // Example: m_data="ABC", m_rangeLength=3 -> we need 3+3-1=5 bytes (ABCxx,xABCx,xxABC would match) - const int dataNeeded = qMin(mDataSize + m_rangeLength - 1, deviceSize - m_rangeStart); - QByteArray readData; - - /*kDebug() << "need " << dataNeeded << " bytes of data starting at " << m_rangeStart - << " - availableData has " << availableData.size() << " bytes," - << " device has " << deviceSize << " bytes.";*/ - - if (m_rangeStart + dataNeeded > availableData.size() && availableData.size() < deviceSize) { - // Need to read from device - if (!device->seek(m_rangeStart)) - return false; - readData.resize(dataNeeded); - const int nread = device->read(readData.data(), dataNeeded); - //kDebug() << "readData (from device): reading" << dataNeeded << "bytes."; - if (nread < mDataSize) - return false; // error (or not enough data but we checked for that already) - if (m_rangeStart == 0 && readData.size() > availableData.size()) { - availableData = readData; // update cache - } - if (nread < readData.size()) { - // File big enough to contain m_data, but not big enough for the full rangeLength. - // Pad with zeros. - memset(readData.data() + nread, 0, dataNeeded - nread); - } - //kDebug() << "readData (from device) at pos " << m_rangeStart << ":" << readData; - } else { - readData = QByteArray::fromRawData(availableData.constData() + m_rangeStart, - dataNeeded); - // Warning, readData isn't null-terminated so this kDebug - // gives valgrind warnings (when printing as char* data). - //kDebug() << "readData (from availableData) at pos " << m_rangeStart << ":" << readData; - } + const int dataNeeded = qMin(mDataSize + m_rangeLength - 1, availableData.size() - m_rangeStart); + QByteArray readData = QByteArray::fromRawData( + availableData.constData() + m_rangeStart, + dataNeeded + ); + // Warning, readData isn't null-terminated so this kDebug + // gives valgrind warnings (when printing as char* data). + //kDebug() << "readData (from availableData) at pos " << m_rangeStart << ":" << readData; // All we need to do now, is to look for m_data in readData (whose size is dataNeeded). // Either as a simple indexOf search, or applying the mask. @@ -141,5 +117,5 @@ bool KMimeMagicMatch::match(QIODevice* device, qint64 deviceSize, QByteArray& av return true; // Check that one of the submatches matches too - return testMatches(device, deviceSize, availableData, m_subMatches, mimeType); + return testMatches(availableData, m_subMatches, mimeType); } diff --git a/kdecore/services/kmimemagicrule_p.h b/kdecore/services/kmimemagicrule_p.h index 93559530..46272998 100644 --- a/kdecore/services/kmimemagicrule_p.h +++ b/kdecore/services/kmimemagicrule_p.h @@ -31,7 +31,7 @@ */ struct KMimeMagicMatch { - bool match(QIODevice* device, qint64 deviceSize, QByteArray& availableData, const QString& mimeType) const; + bool match(const QByteArray &availableData, const QString& mimeType) const; qint64 m_rangeStart; qint64 m_rangeLength; @@ -54,10 +54,10 @@ struct KMimeMagicMatch class KMimeMagicRule { public: - KMimeMagicRule(const QString& mimetype, int priority, const QList& matches) + KMimeMagicRule(const QString &mimetype, int priority, const QList &matches) : m_mimetype(mimetype), m_priority(priority), m_matches(matches) {} - bool match(QIODevice* device, qint64 deviceSize, QByteArray& availableData) const; + bool match(const QByteArray &availableData) const; QString mimetype() const { return m_mimetype; } int priority() const { return m_priority; } diff --git a/kdecore/services/kmimetype.cpp b/kdecore/services/kmimetype.cpp index 5bc6e02a..c97931cc 100644 --- a/kdecore/services/kmimetype.cpp +++ b/kdecore/services/kmimetype.cpp @@ -223,7 +223,8 @@ KMimeType::Ptr KMimeType::findByUrl(const KUrl &url, mode_t mode, QFile file(localfile); if (file.open(QIODevice::ReadOnly)) { int magicAccuracy = 0; - KMimeType::Ptr mime = KMimeTypeRepository::self()->findFromContent(&file, &magicAccuracy); + // provide enough data for most rules (there are exceptions which require twice as much tho) + KMimeType::Ptr mime = KMimeTypeRepository::self()->findFromContent(file.read(16384), &magicAccuracy); // mime can't be 0, except in case of install problems. // However we get magicAccuracy==0 for octet-stream, i.e. no magic match found. // kDebug() << "findFromContent said" << (mime?mime->name():QString()) << "with accuracy" << magicAccuracy; @@ -288,9 +289,7 @@ KMimeType::Ptr KMimeType::findByName(const QString &fileName, int *accuracy) KMimeType::Ptr KMimeType::findByContent(const QByteArray &data, int *accuracy) { KMimeTypeRepository::self()->checkEssentialMimeTypes(); - QBuffer buffer(const_cast(&data)); - buffer.open(QIODevice::ReadOnly); - return KMimeTypeRepository::self()->findFromContent(&buffer, accuracy); + return KMimeTypeRepository::self()->findFromContent(data, accuracy); } QString KMimeType::extractKnownExtension(const QString &fileName) diff --git a/kdecore/services/kmimetyperepository.cpp b/kdecore/services/kmimetyperepository.cpp index 7b5cce91..f7bf1a25 100644 --- a/kdecore/services/kmimetyperepository.cpp +++ b/kdecore/services/kmimetyperepository.cpp @@ -314,26 +314,18 @@ QStringList KMimeTypeRepository::findFromFileName(const QString &fileName, QStri return matchingMimeTypes; } -KMimeType::Ptr KMimeTypeRepository::findFromContent(QIODevice* device, int* accuracy) +KMimeType::Ptr KMimeTypeRepository::findFromContent(const QByteArray &data, int* accuracy) { - Q_ASSERT(device->isOpen()); - const qint64 deviceSize = device->size(); - if (deviceSize == 0) { + if (data.size() == 0) { if (accuracy) { *accuracy = 100; } return findMimeTypeByName(QLatin1String("application/x-zerosize"), KMimeType::DontResolveAlias); } - // provide enough data for most rules (there are exceptions which require twice as much tho) - const qint64 dataNeeded = qMin(deviceSize, (qint64) 16384); - QByteArray beginning(dataNeeded, '\0'); - if (!device->seek(0) || device->read(beginning.data(), dataNeeded) == -1) { - return defaultMimeTypePtr(); // don't bother detecting unreadable file - } // Apply magic rules Q_FOREACH ( const KMimeMagicRule& rule, m_magicRules ) { - if (rule.match(device, deviceSize, beginning)) { + if (rule.match( data)) { if (accuracy) { *accuracy = rule.priority(); } @@ -343,7 +335,7 @@ KMimeType::Ptr KMimeTypeRepository::findFromContent(QIODevice* device, int* accu // Do fallback code so that we never return 0 // Nothing worked, check if the file contents looks like binary or text - if (!KMimeType::isBufferBinaryData(beginning)) { + if (!KMimeType::isBufferBinaryData(data)) { if (accuracy) { *accuracy = 5; } diff --git a/kdecore/services/kmimetyperepository_p.h b/kdecore/services/kmimetyperepository_p.h index bad1ee8a..1b2819b8 100644 --- a/kdecore/services/kmimetyperepository_p.h +++ b/kdecore/services/kmimetyperepository_p.h @@ -97,13 +97,13 @@ private: // only for KMimeType and unittests QStringList findFromFileName(const QString &filename, QString *matchingExtension = nullptr) const; /** - * Find a mimetype from the content of a file or buffer - * @param device the file or buffer. Must be open. + * Find a mimetype from the content of data chunk + * @param data chunk of data * @param accuracy returns the priority of the rule that matched * * This is internal API, use KMimeType::findByUrl instead. */ - KMimeType::Ptr findFromContent(QIODevice *device, int *accuracy); + KMimeType::Ptr findFromContent(const QByteArray &data, int *accuracy); /** * @return true if at least one mimetype is present diff --git a/kdecore/tests/kmimetypetest.cpp b/kdecore/tests/kmimetypetest.cpp index 34c6999b..74d5e168 100644 --- a/kdecore/tests/kmimetypetest.cpp +++ b/kdecore/tests/kmimetypetest.cpp @@ -840,22 +840,17 @@ void KMimeTypeTest::testParseMagicFile_data() void KMimeTypeTest::testParseMagicFile() { QFETCH(QString, testData); - //kDebug() << QTest::currentDataTag(); + // kDebug() << QTest::currentDataTag(); QFETCH(QString, expected); - QBuffer testBuffer; - testBuffer.setData(testData.toLatin1()); - QVERIFY(testBuffer.open(QIODevice::ReadOnly)); - const qint64 testBufferSize = testBuffer.size(); + QByteArray testDataBytes = testData.toLatin1(); QString found; - QByteArray beginning; Q_FOREACH(const KMimeMagicRule& rule, m_rules) { - if (rule.match(&testBuffer, testBufferSize, beginning)) { + if (rule.match(testDataBytes)) { found = rule.mimetype(); break; } } QCOMPARE(found, expected); - testBuffer.close(); } void KMimeTypeTest::testHelperProtocols()