kdecore: optimize KMimeType::findByContent()

Read the data once, then match the magic rules against that in-memory
buffer until one of them matches or all have been tried. This means less
disk I/O.

Signed-off-by: Ivailo Monev <xakepa10@gmail.com>
This commit is contained in:
Ivailo Monev 2024-05-18 05:02:20 +03:00
parent ddbcca439d
commit 969f3e428e
6 changed files with 31 additions and 69 deletions

View file

@ -41,12 +41,12 @@
* *
*/ */
static bool testMatches(QIODevice* device, qint64 deviceSize, QByteArray& availableData, const QList<KMimeMagicMatch>& matches, const QString& mimeType) static bool testMatches(const QByteArray &availableData, const QList<KMimeMagicMatch>& matches, const QString& mimeType)
{ {
for ( QList<KMimeMagicMatch>::const_iterator it = matches.begin(), end = matches.end() ; for ( QList<KMimeMagicMatch>::const_iterator it = matches.begin(), end = matches.end() ;
it != end ; ++it ) { it != end ; ++it ) {
const KMimeMagicMatch& match = *it; const KMimeMagicMatch& match = *it;
if (match.match(device, deviceSize, availableData, mimeType)) { if (match.match(availableData, mimeType)) {
// One of the hierarchies matched -> mimetype recognized. // One of the hierarchies matched -> mimetype recognized.
return true; return true;
} }
@ -54,53 +54,29 @@ static bool testMatches(QIODevice* device, qint64 deviceSize, QByteArray& availa
return false; return false;
} }
bool KMimeMagicRule::match(QIODevice* device, qint64 deviceSize, QByteArray& availableData) const bool KMimeMagicRule::match(const QByteArray &availableData) const
{ {
return testMatches(device, deviceSize, availableData, m_matches, m_mimetype); return testMatches(availableData, m_matches, m_mimetype);
} }
bool KMimeMagicMatch::match(QIODevice* device, qint64 deviceSize, QByteArray& availableData, const QString& mimeType) const bool KMimeMagicMatch::match(const QByteArray &availableData, const QString& mimeType) const
{ {
// First, check that "this" matches, then we'll dive into subMatches if any. // First, check that "this" matches, then we'll dive into subMatches if any.
const qint64 mDataSize = m_data.size(); const qint64 mDataSize = m_data.size();
if (m_rangeStart + mDataSize > deviceSize) if (m_rangeStart + mDataSize > availableData.size())
return false; // file is too small return false; // file is too small
// Read in one block all the data we'll need // Read in one block all the data we'll need
// Example: m_data="ABC", m_rangeLength=3 -> we need 3+3-1=5 bytes (ABCxx,xABCx,xxABC would match) // Example: m_data="ABC", m_rangeLength=3 -> we need 3+3-1=5 bytes (ABCxx,xABCx,xxABC would match)
const int dataNeeded = qMin(mDataSize + m_rangeLength - 1, deviceSize - m_rangeStart); const int dataNeeded = qMin(mDataSize + m_rangeLength - 1, availableData.size() - m_rangeStart);
QByteArray readData; QByteArray readData = QByteArray::fromRawData(
availableData.constData() + m_rangeStart,
/*kDebug() << "need " << dataNeeded << " bytes of data starting at " << m_rangeStart dataNeeded
<< " - availableData has " << availableData.size() << " bytes," );
<< " device has " << deviceSize << " bytes.";*/
if (m_rangeStart + dataNeeded > availableData.size() && availableData.size() < deviceSize) {
// Need to read from device
if (!device->seek(m_rangeStart))
return false;
readData.resize(dataNeeded);
const int nread = device->read(readData.data(), dataNeeded);
//kDebug() << "readData (from device): reading" << dataNeeded << "bytes.";
if (nread < mDataSize)
return false; // error (or not enough data but we checked for that already)
if (m_rangeStart == 0 && readData.size() > availableData.size()) {
availableData = readData; // update cache
}
if (nread < readData.size()) {
// File big enough to contain m_data, but not big enough for the full rangeLength.
// Pad with zeros.
memset(readData.data() + nread, 0, dataNeeded - nread);
}
//kDebug() << "readData (from device) at pos " << m_rangeStart << ":" << readData;
} else {
readData = QByteArray::fromRawData(availableData.constData() + m_rangeStart,
dataNeeded);
// Warning, readData isn't null-terminated so this kDebug // Warning, readData isn't null-terminated so this kDebug
// gives valgrind warnings (when printing as char* data). // gives valgrind warnings (when printing as char* data).
//kDebug() << "readData (from availableData) at pos " << m_rangeStart << ":" << readData; //kDebug() << "readData (from availableData) at pos " << m_rangeStart << ":" << readData;
}
// All we need to do now, is to look for m_data in readData (whose size is dataNeeded). // All we need to do now, is to look for m_data in readData (whose size is dataNeeded).
// Either as a simple indexOf search, or applying the mask. // Either as a simple indexOf search, or applying the mask.
@ -141,5 +117,5 @@ bool KMimeMagicMatch::match(QIODevice* device, qint64 deviceSize, QByteArray& av
return true; return true;
// Check that one of the submatches matches too // Check that one of the submatches matches too
return testMatches(device, deviceSize, availableData, m_subMatches, mimeType); return testMatches(availableData, m_subMatches, mimeType);
} }

View file

@ -31,7 +31,7 @@
*/ */
struct KMimeMagicMatch struct KMimeMagicMatch
{ {
bool match(QIODevice* device, qint64 deviceSize, QByteArray& availableData, const QString& mimeType) const; bool match(const QByteArray &availableData, const QString& mimeType) const;
qint64 m_rangeStart; qint64 m_rangeStart;
qint64 m_rangeLength; qint64 m_rangeLength;
@ -57,7 +57,7 @@ public:
KMimeMagicRule(const QString &mimetype, int priority, const QList<KMimeMagicMatch> &matches) KMimeMagicRule(const QString &mimetype, int priority, const QList<KMimeMagicMatch> &matches)
: m_mimetype(mimetype), m_priority(priority), m_matches(matches) {} : m_mimetype(mimetype), m_priority(priority), m_matches(matches) {}
bool match(QIODevice* device, qint64 deviceSize, QByteArray& availableData) const; bool match(const QByteArray &availableData) const;
QString mimetype() const { return m_mimetype; } QString mimetype() const { return m_mimetype; }
int priority() const { return m_priority; } int priority() const { return m_priority; }

View file

@ -223,7 +223,8 @@ KMimeType::Ptr KMimeType::findByUrl(const KUrl &url, mode_t mode,
QFile file(localfile); QFile file(localfile);
if (file.open(QIODevice::ReadOnly)) { if (file.open(QIODevice::ReadOnly)) {
int magicAccuracy = 0; int magicAccuracy = 0;
KMimeType::Ptr mime = KMimeTypeRepository::self()->findFromContent(&file, &magicAccuracy); // provide enough data for most rules (there are exceptions which require twice as much tho)
KMimeType::Ptr mime = KMimeTypeRepository::self()->findFromContent(file.read(16384), &magicAccuracy);
// mime can't be 0, except in case of install problems. // mime can't be 0, except in case of install problems.
// However we get magicAccuracy==0 for octet-stream, i.e. no magic match found. // However we get magicAccuracy==0 for octet-stream, i.e. no magic match found.
// kDebug() << "findFromContent said" << (mime?mime->name():QString()) << "with accuracy" << magicAccuracy; // kDebug() << "findFromContent said" << (mime?mime->name():QString()) << "with accuracy" << magicAccuracy;
@ -288,9 +289,7 @@ KMimeType::Ptr KMimeType::findByName(const QString &fileName, int *accuracy)
KMimeType::Ptr KMimeType::findByContent(const QByteArray &data, int *accuracy) KMimeType::Ptr KMimeType::findByContent(const QByteArray &data, int *accuracy)
{ {
KMimeTypeRepository::self()->checkEssentialMimeTypes(); KMimeTypeRepository::self()->checkEssentialMimeTypes();
QBuffer buffer(const_cast<QByteArray *>(&data)); return KMimeTypeRepository::self()->findFromContent(data, accuracy);
buffer.open(QIODevice::ReadOnly);
return KMimeTypeRepository::self()->findFromContent(&buffer, accuracy);
} }
QString KMimeType::extractKnownExtension(const QString &fileName) QString KMimeType::extractKnownExtension(const QString &fileName)

View file

@ -314,26 +314,18 @@ QStringList KMimeTypeRepository::findFromFileName(const QString &fileName, QStri
return matchingMimeTypes; return matchingMimeTypes;
} }
KMimeType::Ptr KMimeTypeRepository::findFromContent(QIODevice* device, int* accuracy) KMimeType::Ptr KMimeTypeRepository::findFromContent(const QByteArray &data, int* accuracy)
{ {
Q_ASSERT(device->isOpen()); if (data.size() == 0) {
const qint64 deviceSize = device->size();
if (deviceSize == 0) {
if (accuracy) { if (accuracy) {
*accuracy = 100; *accuracy = 100;
} }
return findMimeTypeByName(QLatin1String("application/x-zerosize"), KMimeType::DontResolveAlias); return findMimeTypeByName(QLatin1String("application/x-zerosize"), KMimeType::DontResolveAlias);
} }
// provide enough data for most rules (there are exceptions which require twice as much tho)
const qint64 dataNeeded = qMin(deviceSize, (qint64) 16384);
QByteArray beginning(dataNeeded, '\0');
if (!device->seek(0) || device->read(beginning.data(), dataNeeded) == -1) {
return defaultMimeTypePtr(); // don't bother detecting unreadable file
}
// Apply magic rules // Apply magic rules
Q_FOREACH ( const KMimeMagicRule& rule, m_magicRules ) { Q_FOREACH ( const KMimeMagicRule& rule, m_magicRules ) {
if (rule.match(device, deviceSize, beginning)) { if (rule.match( data)) {
if (accuracy) { if (accuracy) {
*accuracy = rule.priority(); *accuracy = rule.priority();
} }
@ -343,7 +335,7 @@ KMimeType::Ptr KMimeTypeRepository::findFromContent(QIODevice* device, int* accu
// Do fallback code so that we never return 0 // Do fallback code so that we never return 0
// Nothing worked, check if the file contents looks like binary or text // Nothing worked, check if the file contents looks like binary or text
if (!KMimeType::isBufferBinaryData(beginning)) { if (!KMimeType::isBufferBinaryData(data)) {
if (accuracy) { if (accuracy) {
*accuracy = 5; *accuracy = 5;
} }

View file

@ -97,13 +97,13 @@ private: // only for KMimeType and unittests
QStringList findFromFileName(const QString &filename, QString *matchingExtension = nullptr) const; QStringList findFromFileName(const QString &filename, QString *matchingExtension = nullptr) const;
/** /**
* Find a mimetype from the content of a file or buffer * Find a mimetype from the content of data chunk
* @param device the file or buffer. Must be open. * @param data chunk of data
* @param accuracy returns the priority of the rule that matched * @param accuracy returns the priority of the rule that matched
* *
* This is internal API, use KMimeType::findByUrl instead. * This is internal API, use KMimeType::findByUrl instead.
*/ */
KMimeType::Ptr findFromContent(QIODevice *device, int *accuracy); KMimeType::Ptr findFromContent(const QByteArray &data, int *accuracy);
/** /**
* @return true if at least one mimetype is present * @return true if at least one mimetype is present

View file

@ -842,20 +842,15 @@ void KMimeTypeTest::testParseMagicFile()
QFETCH(QString, testData); QFETCH(QString, testData);
// kDebug() << QTest::currentDataTag(); // kDebug() << QTest::currentDataTag();
QFETCH(QString, expected); QFETCH(QString, expected);
QBuffer testBuffer; QByteArray testDataBytes = testData.toLatin1();
testBuffer.setData(testData.toLatin1());
QVERIFY(testBuffer.open(QIODevice::ReadOnly));
const qint64 testBufferSize = testBuffer.size();
QString found; QString found;
QByteArray beginning;
Q_FOREACH(const KMimeMagicRule& rule, m_rules) { Q_FOREACH(const KMimeMagicRule& rule, m_rules) {
if (rule.match(&testBuffer, testBufferSize, beginning)) { if (rule.match(testDataBytes)) {
found = rule.mimetype(); found = rule.mimetype();
break; break;
} }
} }
QCOMPARE(found, expected); QCOMPARE(found, expected);
testBuffer.close();
} }
void KMimeTypeTest::testHelperProtocols() void KMimeTypeTest::testHelperProtocols()