kdecore: MIME glob matching optimization

By doing the matching in a single iteration over the globs, a lot less
processing has to be done.

Signed-off-by: Ivailo Monev <xakepa10@gmail.com>
This commit is contained in:
Ivailo Monev 2024-05-05 00:11:11 +03:00
parent d338e13b8d
commit 1ff6f4d2ee
5 changed files with 67 additions and 156 deletions

View file

@ -18,25 +18,23 @@
*/
#include "kmimeglobsfileparser_p.h"
#include <kglobal.h>
#include <kdeversion.h>
#include <kmimetype.h>
#include <kstandarddirs.h>
#include "kglobal.h"
#include "kdeversion.h"
#include "kmimetype.h"
#include "kstandarddirs.h"
#include "kmimetyperepository_p.h"
#include <kdebug.h>
#include <QtCore/QTextStream>
#include <QtCore/QFile>
#include "kdebug.h"
KMimeGlobsFileParser::AllGlobs KMimeGlobsFileParser::parseGlobs()
#include <QFile>
/**
 * Ordering predicate used to sort globs by descending weight.
 *
 * @param first  glob that would be placed earlier
 * @param second glob that would be placed later
 * @return true only when @p first weighs strictly more than @p second
 *
 * The comparison has to be strict. Sort algorithms such as qStableSort()
 * expect "less than" semantics from their predicate; answering true for two
 * globs of equal weight would claim that each of them goes before the other,
 * so the relative order of equal-weight globs would not be preserved.
 */
static bool kGlobSort(const KMimeGlobsFileParser::Glob &first, const KMimeGlobsFileParser::Glob &second)
{
    return (first.weight > second.weight);
}
KMimeGlobsFileParser::AllGlobs KMimeGlobsFileParser::parseGlobs(const QStringList &globFiles)
KMimeGlobsFileParser::GlobList KMimeGlobsFileParser::parseGlobs(const QStringList &globFiles)
{
KMimeGlobsFileParser::AllGlobs allGlobs;
KMimeGlobsFileParser::GlobList allGlobs;
QListIterator<QString> globIter(globFiles);
globIter.toBack();
// At each level, we must be able to override (not just add to) the information that we read at higher levels
@ -47,6 +45,10 @@ KMimeGlobsFileParser::AllGlobs KMimeGlobsFileParser::parseGlobs(const QStringLis
//kDebug() << "Now parsing" << fileName;
parseGlobFile(&globFile, allGlobs);
}
// globs2 files are weight-sorted; sort manually only when more than one file is parsed
if (globFiles.size() > 1) {
qStableSort(allGlobs.begin(), allGlobs.end(), kGlobSort);
}
return allGlobs;
}
@ -63,7 +65,7 @@ static void filterEmptyFromList(QList<QByteArray>* bytelist)
}
// uses a QIODevice to make unit tests possible
bool KMimeGlobsFileParser::parseGlobFile(QIODevice* file, AllGlobs& globs)
bool KMimeGlobsFileParser::parseGlobFile(QIODevice* file, GlobList& globs)
{
Q_ASSERT(file);
if (!file->open(QIODevice::ReadOnly)) {
@ -106,8 +108,6 @@ bool KMimeGlobsFileParser::parseGlobFile(QIODevice* file, AllGlobs& globs)
continue;
}
bool caseSensitive = flagList.contains(QByteArray("cs"));
const QString mimeTypeNameStr = QString::fromLatin1(mimeTypeName.constData(), mimeTypeName.size());
if (pattern == "__NOGLOBS__") {
// kDebug() << "removing" << mimeTypeName;
@ -118,62 +118,22 @@ bool KMimeGlobsFileParser::parseGlobFile(QIODevice* file, AllGlobs& globs)
// kDebug() << "Adding pattern" << pattern << "to mimetype" << mimeTypeName << "from globs file, with weight" << weight;
//if (pattern.toLower() == "*.c")
// kDebug() << " Adding pattern" << pattern << "to mimetype" << mimeTypeName << "from globs file, with weight" << weight << "flags" << flags;
const QString patternStr = QString::fromLatin1(pattern.constData(), pattern.size());
globs.addGlob(Glob(mimeTypeNameStr, weight, patternStr, caseSensitive));
const bool caseSensitive = flagList.contains(QByteArray("cs"));
const QByteArray patternCs = (caseSensitive ? pattern : pattern.toLower());
const QString patternStr = QString::fromLatin1(patternCs.constData(), patternCs.size());
if (!globs.hasPattern(mimeTypeNameStr, patternStr)) {
globs.append(
Glob(
mimeTypeNameStr,
weight,
patternStr,
caseSensitive
)
);
}
lastMime = mimeTypeName;
lastPattern = pattern;
}
}
return true;
}
void KMimeGlobsFileParser::AllGlobs::addGlob(const Glob& glob)
{
// Note that in each case, we check for duplicates to avoid inserting duplicated patterns.
// This can happen when installing kde.xml and freedesktop.org.xml
// in the same prefix, and they both have text/plain:*.txt
const QString &pattern = glob.pattern;
Q_ASSERT(!pattern.isEmpty());
Q_UNUSED(pattern);
//kDebug() << "pattern" << pattern << "glob.weight=" << glob.weight << glob.flags;
// Store each patterns into either m_fastPatternDict (*.txt, *.html etc. with default weight 50)
// or for the rest, like core.*, *.tar.bz2, *~, into highWeightPatternOffset (>50)
// or lowWeightPatternOffset (<=50)
Glob adjustedGlob(glob);
if (!adjustedGlob.casesensitive)
adjustedGlob.pattern = adjustedGlob.pattern.toLower();
if (adjustedGlob.weight >= 50) {
if (!m_highWeightGlobs.hasPattern(adjustedGlob.mimeType, adjustedGlob.pattern))
m_highWeightGlobs.append(adjustedGlob);
} else {
if (!m_lowWeightGlobs.hasPattern(adjustedGlob.mimeType, adjustedGlob.pattern))
m_lowWeightGlobs.append(adjustedGlob);
}
}
// Builds a mimetype -> patterns map out of both glob lists, high-weight globs
// first and low-weight globs after them.
// This is just to fill in KMimeType::patterns; it has no real effect on the
// actual mimetype matching.
KMimeGlobsFileParser::PatternsMap KMimeGlobsFileParser::AllGlobs::patternsMap() const
{
    PatternsMap result;
    result.reserve(m_highWeightGlobs.size() + m_lowWeightGlobs.size());

    const GlobList *const buckets[] = { &m_highWeightGlobs, &m_lowWeightGlobs };
    for (int i = 0; i < 2; ++i) {
        GlobList::const_iterator entry = buckets[i]->constBegin();
        const GlobList::const_iterator last = buckets[i]->constEnd();
        for (; entry != last; ++entry) {
            result[entry->mimeType].append(entry->pattern);
        }
    }
    return result;
}
void KMimeGlobsFileParser::AllGlobs::removeMime(const QString& mime)
{
m_highWeightGlobs.removeMime(mime);
m_lowWeightGlobs.removeMime(mime);
}

View file

@ -25,6 +25,7 @@
#include <QIODevice>
#include <QString>
#include <QStringList>
/**
* @internal
@ -32,13 +33,7 @@
class KMimeGlobsFileParser
{
public:
class AllGlobs;
// Read globs (patterns) files
static AllGlobs parseGlobs();
// Separate method, for unit test
static AllGlobs parseGlobs(const QStringList &globFiles);
typedef QHash<QString, QStringList> PatternsMap; // mimetype -> patterns
struct Glob {
Glob(const QString &mime, int w = 50, const QString &pat = QString(), bool cs = false)
@ -52,7 +47,8 @@ public:
class GlobList : public QList<Glob>
{
public:
bool hasPattern(const QString &mime, const QString &pattern) const {
bool hasPattern(const QString &mime, const QString &pattern) const
{
const_iterator it = begin();
const const_iterator myend = end();
for (; it != myend; ++it)
@ -61,37 +57,32 @@ public:
return false;
}
// Removes every glob in this list whose mimeType equals the given mimetype.
// "noglobs" is a very rare occurrence, so it's ok if it's slow
// NOTE(review): the signature appears twice below (old and new brace style);
// this looks like diff residue - confirm against the actual header.
void removeMime(const QString& mime) {
void removeMime(const QString &mime)
{
QMutableListIterator<Glob> it(*this);
while (it.hasNext()) {
// next() advances the iterator; remove() then deletes the item just returned
if (it.next().mimeType == mime)
it.remove();
}
}
// for tests: collects the patterns of every glob in this list into a
// mimetype -> patterns map, keeping the list order within each mimetype
PatternsMap patternsMap() const
{
    PatternsMap result;
    result.reserve(size());
    const const_iterator last = constEnd();
    for (const_iterator glob = constBegin(); glob != last; ++glob) {
        result[glob->mimeType].append(glob->pattern);
    }
    return result;
}
};
typedef QHash<QString, QStringList> PatternsMap; // mimetype -> patterns
// Read globs (patterns) files
static GlobList parseGlobs(const QStringList &globFiles);
/**
* Result of the globs parsing, split by weight into two linear lists:
* 1) a linear list of high-weight globs (weight >= 50)
* 2) a linear list of low-weight globs (weight < 50)
* The lists are public members and are read directly by the matching code
* (KMimeTypeRepository::findFromOtherPatternList).
*/
class AllGlobs
{
public:
// Adds a glob to the list matching its weight, skipping a (mimetype, pattern)
// pair that the list already contains.
void addGlob(const Glob &glob);
// Removes all globs of the given mimetype from both lists.
void removeMime(const QString &mime);
// Returns a mimetype -> patterns map built from both lists.
PatternsMap patternsMap() const; // for KMimeTypeFactory
GlobList m_highWeightGlobs; // >= 50 patterns
GlobList m_lowWeightGlobs; // < 50 patterns
};
static bool parseGlobFile(QIODevice *file, AllGlobs &globs);
static bool parseGlobFile(QIODevice *file, GlobList &globs);
};
#endif /* KMIMEFILEPARSER_H */

View file

@ -110,8 +110,8 @@ void KMimeTypeRepository::parseMimeData()
{
QMutexLocker locker(&m_mutex);
KMimeGlobsFileParser parser;
m_globs = parser.parseGlobs();
const QStringList globFiles = KGlobal::dirs()->findAllResources("xdgdata-mime", QString::fromLatin1("globs2"));
m_globs = KMimeGlobsFileParser::parseGlobs(globFiles);
m_aliases.clear();
const QStringList aliasFiles = KGlobal::dirs()->findAllResources("xdgdata-mime", QLatin1String("aliases"));
@ -286,31 +286,19 @@ bool KMimeTypeRepository::matchFileName(const QString &filename, const QString &
return rx.exactMatch(filename);
}
// Helper for findFromFileName
void KMimeTypeRepository::findFromOtherPatternList(QStringList &matchingMimeTypes,
const QString &fileName,
QString &foundExt,
bool highWeight) const
QStringList KMimeTypeRepository::findFromFileName(const QString &fileName, QString *pMatchingExtension) const
{
const KMimeGlobsFileParser::GlobList patternList = highWeight ? m_globs.m_highWeightGlobs : m_globs.m_lowWeightGlobs;
QStringList matchingMimeTypes;
QString foundExt;
int matchingPatternLength = 0;
qint32 lastMatchedWeight = 0;
if (!highWeight && !matchingMimeTypes.isEmpty()) {
// We found matches in the fast pattern dict already:
matchingPatternLength = foundExt.length() + 2; // *.foo -> length=5
lastMatchedWeight = 50;
}
// "Applications MUST match globs case-insensitively, except when the case-sensitive
// attribute is set to true."
// KMimeGlobsFileParser takes care of putting case-insensitive patterns in lowercase.
const QString lowerCaseFileName = fileName.toLower();
KMimeGlobsFileParser::GlobList::const_iterator it = patternList.constBegin();
const KMimeGlobsFileParser::GlobList::const_iterator end = patternList.constEnd();
for ( ; it != end; ++it ) {
const KMimeGlobsFileParser::Glob& glob = *it;
foreach (const KMimeGlobsFileParser::Glob &glob, m_globs) {
if (matchFileName(glob.casesensitive ? fileName : lowerCaseFileName, glob.pattern)) {
// Is this a lower-weight pattern than the last match? Stop here then.
if (glob.weight < lastMatchedWeight) {
@ -331,25 +319,13 @@ void KMimeTypeRepository::findFromOtherPatternList(QStringList &matchingMimeType
// remember the new "longer" length
matchingPatternLength = glob.pattern.length();
}
matchingMimeTypes.push_back(glob.mimeType);
matchingMimeTypes.append(glob.mimeType);
lastMatchedWeight = glob.weight;
if (glob.pattern.startsWith(QLatin1String("*."))) {
foundExt = glob.pattern.mid(2);
}
}
}
}
QStringList KMimeTypeRepository::findFromFileName(const QString &fileName, QString *pMatchingExtension) const
{
// First try the high weight matches (>=50), if any.
QStringList matchingMimeTypes;
QString foundExt;
findFromOtherPatternList(matchingMimeTypes, fileName, foundExt, true);
if (matchingMimeTypes.isEmpty() || foundExt.isEmpty()) {
// Try the low weight matches (<50)
findFromOtherPatternList(matchingMimeTypes, fileName, foundExt, false);
}
if (pMatchingExtension) {
*pMatchingExtension = foundExt;

View file

@ -128,20 +128,6 @@ private:
*/
QList<KMimeMagicRule> parseMagicFile(QIODevice *file, const QString &fileName) const;
/**
* Look into either the high-weight patterns or the low-weight patterns.
* @param matchingMimeTypes in/out parameter. In: the already found mimetypes;
* this is only set when the fast pattern dict found matches (i.e. weight 50)
* and we want to check if there are other, longer, weight 50 matches.
* @param filename the filename we are trying to match
* @param foundExt in/out parameter, the recognized extension of the match
* @param highWeight whether to look into >50 or <=50 patterns.
*/
void findFromOtherPatternList(QStringList &matchingMimeTypes,
const QString &filename,
QString &foundExt,
bool highWeight) const;
typedef QHash<QString, QString> AliasesMap;
AliasesMap m_aliases; // alias -> canonicalName
@ -153,7 +139,7 @@ private:
bool m_useFavIconsChecked;
int m_sharedMimeInfoVersion;
QList<KMimeMagicRule> m_magicRules;
KMimeGlobsFileParser::AllGlobs m_globs;
KMimeGlobsFileParser::GlobList m_globs;
KMimeType::Ptr m_defaultMimeType;
QMutex m_mutex;
};

View file

@ -45,10 +45,9 @@ private Q_SLOTS:
"40:text/plain:*.kmimefileparserunittest\n"
"20:text/plain:*.kmimefileparserunittest2::futureextension";
QBuffer buf(&testFile);
KMimeGlobsFileParser::AllGlobs mimeTypeGlobs;
QVERIFY(KMimeGlobsFileParser::parseGlobFile(&buf, mimeTypeGlobs));
//kDebug() << mimeTypeGlobs.keys();
const KMimeGlobsFileParser::GlobList textGlobs = mimeTypeGlobs.m_lowWeightGlobs;
KMimeGlobsFileParser::GlobList textGlobs;
QVERIFY(KMimeGlobsFileParser::parseGlobFile(&buf, textGlobs));
//kDebug() << textGlobs.keys();
QCOMPARE(textGlobs.count(), 2);
QCOMPARE(textGlobs[0].pattern, ext1);
QCOMPARE(textGlobs[0].mimeType, QString("text/plain"));
@ -74,7 +73,7 @@ private Q_SLOTS:
const QString fileName = globTempFile.fileName();
globTempFile.close();
KMimeGlobsFileParser::AllGlobs globs = parser.parseGlobs(QStringList() << fileName);
KMimeGlobsFileParser::GlobList globs = parser.parseGlobs(QStringList() << fileName);
const QStringList textPlainPatterns = globs.patternsMap().value("text/plain");
QVERIFY(textPlainPatterns.contains(ext1));
@ -109,7 +108,7 @@ private Q_SLOTS:
const QString fileName2 = globTempFile2.fileName();
globTempFile2.close();
KMimeGlobsFileParser::AllGlobs globs = parser.parseGlobs(QStringList() << fileName1 << fileName2);
KMimeGlobsFileParser::GlobList globs = parser.parseGlobs(QStringList() << fileName1 << fileName2);
const QStringList textPlainPatterns = globs.patternsMap().value("text/plain");
kDebug() << textPlainPatterns;
@ -144,7 +143,7 @@ private Q_SLOTS:
const QString fileName2 = globTempFile2.fileName();
globTempFile2.close();
KMimeGlobsFileParser::AllGlobs globs = parser.parseGlobs(QStringList() << fileName1 << fileName2);
KMimeGlobsFileParser::GlobList globs = parser.parseGlobs(QStringList() << fileName1 << fileName2);
const QStringList textPlainPatterns = globs.patternsMap().value("text/plain");
kDebug() << textPlainPatterns;
@ -179,8 +178,7 @@ private Q_SLOTS:
const QStringList globFiles = KGlobal::dirs()->findAllResources("xdgdata-mime", "globs2");
m_allGlobs = KMimeGlobsFileParser::parseGlobs(globFiles);
m_patternsMap = m_allGlobs.patternsMap();
const int patCount = m_allGlobs.m_highWeightGlobs.count() + m_allGlobs.m_lowWeightGlobs.count();
qDebug() << m_patternsMap.count() << "mimetypes," << patCount << "patterns";
qDebug() << m_patternsMap.count() << "mimetypes," << m_allGlobs.count() << "patterns";
}
void testGlobMatchingPerformance()
@ -244,7 +242,7 @@ private:
}
private:
KMimeGlobsFileParser::AllGlobs m_allGlobs;
KMimeGlobsFileParser::GlobList m_allGlobs;
KMimeGlobsFileParser::PatternsMap m_patternsMap;
};