Skip to content

Commit fa1e2e9

Browse files
committed
perf(preview): bulk process preview regeneration
This is a WIP [skip ci] Signed-off-by: Anna Larch <anna@nextcloud.com>
1 parent 9afbad3 commit fa1e2e9

File tree

2 files changed

+511
-69
lines changed

2 files changed

+511
-69
lines changed

lib/private/Preview/Storage/LocalPreviewStorage.php

Lines changed: 227 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
use OC\Preview\Db\Preview;
1717
use OC\Preview\Db\PreviewMapper;
1818
use OCP\DB\Exception;
19+
use OCP\DB\QueryBuilder\IQueryBuilder;
1920
use OCP\Files\IMimeTypeDetector;
2021
use OCP\Files\IMimeTypeLoader;
2122
use OCP\Files\IRootFolder;
@@ -30,6 +31,8 @@
3031
use RecursiveIteratorIterator;
3132

3233
class LocalPreviewStorage implements IPreviewStorage {
34+
private const SCAN_BATCH_SIZE = 1000;
35+
3336
public function __construct(
3437
private readonly IConfig $config,
3538
private readonly PreviewMapper $previewMapper,
@@ -117,88 +120,242 @@ public function scan(): int {
117120
if (!file_exists($this->getPreviewRootFolder())) {
118121
return 0;
119122
}
123+
120124
$scanner = new RecursiveDirectoryIterator($this->getPreviewRootFolder());
121125
$previewsFound = 0;
122-
$skipFiles = [];
126+
127+
/**
128+
* Use an associative array keyed by path for O(1) lookup instead of
129+
* the O(n) in_array() the original code used.
130+
*
131+
* @var array<string, true> $skipPaths
132+
*/
133+
$skipPaths = [];
134+
135+
/**
136+
* Pending previews grouped by fileId. A single original file can have
137+
* many preview variants (different sizes/formats), so we group them to
138+
* issue one filecache lookup per original file rather than one per
139+
* preview variant.
140+
*
141+
* @var array<int, list<array{preview: Preview, filePath: string, realPath: string}>> $pendingByFileId
142+
*/
143+
$pendingByFileId = [];
144+
145+
/**
146+
* path_hash => realPath for legacy filecache entries that need to be
147+
* cleaned up. Only populated when $checkForFileCache is true.
148+
*
149+
* @var array<string, string> $pendingPathHashes
150+
*/
151+
$pendingPathHashes = [];
152+
$pendingCount = 0;
153+
123154
foreach (new RecursiveIteratorIterator($scanner) as $file) {
124-
if ($file->isFile() && !in_array((string)$file, $skipFiles, true)) {
125-
$preview = Preview::fromPath((string)$file, $this->mimeTypeDetector);
126-
if ($preview === false) {
127-
$this->logger->error('Unable to parse preview information for ' . $file->getRealPath());
128-
continue;
129-
}
155+
if (!$file->isFile()) {
156+
continue;
157+
}
158+
159+
$filePath = (string)$file;
160+
if (isset($skipPaths[$filePath])) {
161+
continue;
162+
}
163+
164+
$preview = Preview::fromPath($filePath, $this->mimeTypeDetector);
165+
if ($preview === false) {
166+
$this->logger->error('Unable to parse preview information for ' . $file->getRealPath());
167+
continue;
168+
}
169+
170+
$preview->setSize($file->getSize());
171+
$preview->setMtime($file->getMtime());
172+
$preview->setEncrypted(false);
173+
174+
$realPath = $file->getRealPath();
175+
$pendingByFileId[$preview->getFileId()][] = [
176+
'preview' => $preview,
177+
'filePath' => $filePath,
178+
'realPath' => $realPath,
179+
];
180+
$pendingCount++;
181+
182+
if ($checkForFileCache) {
183+
$relativePath = str_replace($this->getRootFolder() . '/', '', $realPath);
184+
$pendingPathHashes[md5($relativePath)] = $realPath;
185+
}
186+
187+
if ($pendingCount >= self::SCAN_BATCH_SIZE) {
188+
$this->connection->beginTransaction();
130189
try {
131-
$preview->setSize($file->getSize());
132-
$preview->setMtime($file->getMtime());
133-
$preview->setEncrypted(false);
134-
135-
$qb = $this->connection->getQueryBuilder();
136-
$result = $qb->select('storage', 'etag', 'mimetype')
137-
->from('filecache')
138-
->where($qb->expr()->eq('fileid', $qb->createNamedParameter($preview->getFileId())))
139-
->setMaxResults(1)
140-
->runAcrossAllShards() // Unavoidable because we can't extract the storage_id from the preview name
141-
->executeQuery()
142-
->fetchAssociative();
143-
144-
if ($result === false) {
145-
// original file is deleted
146-
$this->logger->warning('Original file ' . $preview->getFileId() . ' was not found. Deleting preview at ' . $file->getRealPath());
147-
@unlink($file->getRealPath());
148-
continue;
149-
}
190+
$previewsFound += $this->processScanBatch($pendingByFileId, $pendingPathHashes, $checkForFileCache, $skipPaths);
191+
$this->connection->commit();
192+
} catch (\Exception $e) {
193+
$this->connection->rollBack();
194+
$this->logger->error($e->getMessage(), ['exception' => $e]);
195+
throw $e;
196+
}
197+
$pendingByFileId = [];
198+
$pendingPathHashes = [];
199+
$pendingCount = 0;
200+
}
201+
}
202+
203+
if ($pendingCount > 0) {
204+
$this->connection->beginTransaction();
205+
try {
206+
$previewsFound += $this->processScanBatch($pendingByFileId, $pendingPathHashes, $checkForFileCache, $skipPaths);
207+
$this->connection->commit();
208+
} catch (\Exception $e) {
209+
$this->connection->rollBack();
210+
$this->logger->error($e->getMessage(), ['exception' => $e]);
211+
throw $e;
212+
}
213+
}
214+
215+
return $previewsFound;
216+
}
217+
218+
/**
 * Handle one batch of preview files gathered by scan().
 *
 * The filecache rows of all original files in the batch are loaded up front.
 * Previews whose original file has no filecache row are deleted from disk.
 * Every other preview is completed with the storage id, etag and source
 * mimetype of its original file, inserted through the preview mapper and,
 * when its path does not match the nested directory pattern, moved to the
 * path returned by constructPath().
 *
 * NOTE(review): scan() already wraps this call in a transaction, so the
 * begin/commit pair around each insert is nested. Whether a rollBack() here
 * leaves the outer transaction usable depends on savepoint support of the
 * connection - confirm. File system changes (unlink/rename) are not undone
 * by a database rollback.
 *
 * @param array<int, list<array{preview: Preview, filePath: string, realPath: string}>> $pendingByFileId Previews grouped by the id of their original file.
 * @param array<string, string> $pendingPathHashes path_hash => realPath; only the keys are used for the legacy lookup.
 * @param bool $checkForFileCache Whether legacy filecache rows of the previews are looked up and deleted.
 * @param array<string, true> $skipPaths Modified in place: destination paths of moved previews are added so the outer iterator skips them.
 * @return int Number of previews whose original file still exists, including previews that were already stored.
 * @throws Exception When an insert fails for any reason other than a unique constraint violation.
 * @throws LogicException When a preview file cannot be moved to its new location.
 */
private function processScanBatch(
	array $pendingByFileId,
	array $pendingPathHashes,
	bool $checkForFileCache,
	array &$skipPaths,
): int {
	$sourceRows = $this->fetchFilecacheByFileIds(array_keys($pendingByFileId));
	$legacyRows = ($checkForFileCache && $pendingPathHashes !== [])
		? $this->fetchFilecacheByPathHashes(array_keys($pendingPathHashes))
		: [];

	$handled = 0;
	foreach ($pendingByFileId as $fileId => $entries) {
		$sourceRow = $sourceRows[$fileId] ?? null;

		if ($sourceRow === null) {
			// The original file is gone, so none of its previews are needed any more.
			foreach ($entries as ['realPath' => $orphanPath]) {
				$this->logger->warning('Original file ' . $fileId . ' was not found. Deleting preview at ' . $orphanPath);
				@unlink($orphanPath);
			}
			continue;
		}

		/** @var Preview $preview */
		foreach ($entries as ['preview' => $preview, 'filePath' => $scannedPath, 'realPath' => $realPath]) {
			if ($checkForFileCache) {
				// Legacy previews were tracked in the filecache under the hash of their relative path.
				$legacyHash = md5(str_replace($this->getRootFolder() . '/', '', $realPath));
				$legacyRow = $legacyRows[$legacyHash] ?? null;
				if ($legacyRow !== null) {
					$delete = $this->connection->getQueryBuilder();
					$delete->delete('filecache')
						->where($delete->expr()->eq('fileid', $delete->createNamedParameter($legacyRow['fileid'])))
						->andWhere($delete->expr()->eq('storage', $delete->createNamedParameter($legacyRow['storage'])))
						->executeStatement();
					$this->deleteParentsFromFileCache((int)$legacyRow['parent'], (int)$legacyRow['storage']);
				}
			}

			$preview->setStorageId((int)$sourceRow['storage']);
			$preview->setEtag($sourceRow['etag']);
			$preview->setSourceMimetype($this->mimeTypeLoader->getMimetypeById((int)$sourceRow['mimetype']));
			$preview->generateId();

			$this->connection->beginTransaction();
			try {
				$this->previewMapper->insert($preview);
				$this->connection->commit();
			} catch (Exception $e) {
				$this->connection->rollBack();
				// A unique constraint violation means the preview is already in the DB; anything else is fatal.
				$alreadyStored = $e->getReason() === Exception::REASON_UNIQUE_CONSTRAINT_VIOLATION;
				if (!$alreadyStored) {
					throw $e;
				}
			}

			// Move old flat preview to new nested directory format.
			// NOTE(review): the character class stops at 'e'; if the nested folder
			// names are hexadecimal digits this presumably should be [0-9a-f] -
			// confirm against constructPath().
			$relativeLocation = str_replace($this->getPreviewRootFolder(), '', $scannedPath);
			$isNested = preg_match('/[0-9a-e]\/[0-9a-e]\/[0-9a-e]\/[0-9a-e]\/[0-9a-e]\/[0-9a-e]\/[0-9a-e]\/[0-9]+/', $relativeLocation) === 1;
			if (!$isNested) {
				$target = $this->constructPath($preview);
				$this->createParentFiles($target);
				if (!rename($realPath, $target)) {
					throw new LogicException('Failed to move ' . $realPath . ' to ' . $target);
				}
				// Mark the destination so the outer iterator skips it if it encounters the path later.
				$skipPaths[$target] = true;
			}

			$handled++;
		}
	}

	return $handled;
}
201302

303+
/**
 * Bulk-fetch filecache rows for a set of fileIds.
 *
 * The ids are queried in chunks of at most 1000, one query per chunk. The
 * chunks must not be combined on a single query builder with andWhere():
 * AND-ing several IN() lists only matches ids contained in every chunk,
 * which yields no rows as soon as there is more than one chunk.
 *
 * @param int[] $fileIds
 * @return array<int, array<string, mixed>> Rows with the columns fileid, storage, etag and mimetype,
 *                                          keyed by fileid. Ids without a filecache row are absent;
 *                                          if several rows share a fileid the last one fetched wins.
 */
private function fetchFilecacheByFileIds(array $fileIds): array {
	if ($fileIds === []) {
		return [];
	}

	$result = [];
	foreach (array_chunk($fileIds, 1000) as $chunk) {
		// A fresh builder per chunk keeps the IN() predicates independent of each other.
		$qb = $this->connection->getQueryBuilder();
		$rows = $qb->select('fileid', 'storage', 'etag', 'mimetype')
			->from('filecache')
			->where($qb->expr()->in('fileid', $qb->createNamedParameter($chunk, IQueryBuilder::PARAM_INT_ARRAY)))
			->runAcrossAllShards()
			->executeQuery();
		try {
			while (($row = $rows->fetchAssociative()) !== false) {
				$result[(int)$row['fileid']] = $row;
			}
		} finally {
			// Release the cursor even if fetching fails half-way.
			$rows->closeCursor();
		}
	}

	return $result;
}
330+
331+
/**
 * Bulk-fetch filecache rows for a set of path_hashes (legacy migration).
 *
 * The hashes are queried in chunks of at most 1000, one query per chunk. The
 * chunks must not be combined on a single query builder with andWhere():
 * AND-ing several IN() lists only matches hashes contained in every chunk,
 * which yields no rows as soon as there is more than one chunk.
 *
 * @param string[] $pathHashes
 * @return array<string, array<string, mixed>> Rows with the columns fileid, storage, etag, mimetype,
 *                                             parent and path_hash, keyed by path_hash. Hashes without
 *                                             a filecache row are absent; if several rows share a
 *                                             path_hash the last one fetched wins.
 */
private function fetchFilecacheByPathHashes(array $pathHashes): array {
	if ($pathHashes === []) {
		return [];
	}

	$result = [];
	foreach (array_chunk($pathHashes, 1000) as $chunk) {
		// A fresh builder per chunk keeps the IN() predicates independent of each other.
		$qb = $this->connection->getQueryBuilder();
		$rows = $qb->select('fileid', 'storage', 'etag', 'mimetype', 'parent', 'path_hash')
			->from('filecache')
			->where($qb->expr()->in('path_hash', $qb->createNamedParameter($chunk, IQueryBuilder::PARAM_STR_ARRAY)))
			->runAcrossAllShards()
			->executeQuery();
		try {
			while (($row = $rows->fetchAssociative()) !== false) {
				$result[$row['path_hash']] = $row;
			}
		} finally {
			// Release the cursor even if fetching fails half-way.
			$rows->closeCursor();
		}
	}

	return $result;
}
358+
202359
/**
203360
* Recursive method that deletes the folder and its parent folders once they
204361
* are empty.
@@ -210,10 +367,11 @@ private function deleteParentsFromFileCache(int $folderId, int $storageId): void
210367
->where($qb->expr()->eq('parent', $qb->createNamedParameter($folderId)))
211368
->setMaxResults(1)
212369
->runAcrossAllShards()
213-
->executeQuery()
214-
->fetchAssociative();
370+
->executeQuery();
371+
$row = $result->fetchAssociative();
372+
$result->closeCursor();
215373

216-
if ($result !== false) {
374+
if ($row !== false) {
217375
// there are other files in the directory, don't delete yet
218376
return;
219377
}
@@ -225,11 +383,11 @@ private function deleteParentsFromFileCache(int $folderId, int $storageId): void
225383
->where($qb->expr()->eq('fileid', $qb->createNamedParameter($folderId)))
226384
->andWhere($qb->expr()->eq('storage', $qb->createNamedParameter($storageId)))
227385
->setMaxResults(1)
228-
->executeQuery()
229-
->fetchAssociative();
230-
231-
if ($result !== false) {
232-
$parentFolderId = (int)$result['parent'];
386+
->executeQuery();
387+
$row = $result->fetchAssociative();
388+
$result->closeCursor();
389+
if ($row !== false) {
390+
$parentFolderId = (int)$row['parent'];
233391

234392
$qb = $this->connection->getQueryBuilder();
235393
$qb->delete('filecache')

0 commit comments

Comments
 (0)