Merge pull request #2478 from matrix-org/erikj/expire_url_cache_thumbnails
Delete expired url cache data
This commit is contained in:
commit
c9bc4b7031
|
@ -73,19 +73,58 @@ class MediaFilePaths(object):
|
||||||
)
|
)
|
||||||
|
|
||||||
def url_cache_filepath(self, media_id):
    """Return the on-disk path of a downloaded url cache file.

    Args:
        media_id (str): the url cache media id.

    Returns:
        str: a path below ``<base_path>/url_cache``.
    """
    import re

    # New-style ids are of the form <DATE><SEPARATOR><RANDOM_STRING>,
    # e.g.: 2017-09-28_fsdRDt24DS234dsf
    # They are bucketed by date; the single separator character at index 10
    # is dropped from the path.
    if re.match(r"\d{4}-\d{2}-\d{2}", media_id):
        return os.path.join(
            self.base_path, "url_cache",
            media_id[:10], media_id[11:]
        )

    # Ids minted before the date prefix was introduced are plain random
    # strings whose files live in the older two-level layout. Keep resolving
    # them there so existing files can still be found and deleted.
    return os.path.join(
        self.base_path, "url_cache",
        media_id[0:2], media_id[2:4], media_id[4:]
    )
|
||||||
|
|
||||||
|
def url_cache_filepath_dirs_to_delete(self, media_id):
    """The dirs to try and remove if we delete the media_id file.

    Args:
        media_id (str): the url cache media id whose file was removed.

    Returns:
        list[str]: candidate directories, deepest first, so that a caller
        removing them in order only ever attempts a parent after its child.
    """
    import re

    # New-style ids (<DATE><SEPARATOR><RANDOM_STRING>) live directly inside
    # a single per-date directory.
    if re.match(r"\d{4}-\d{2}-\d{2}", media_id):
        return [
            os.path.join(
                self.base_path, "url_cache",
                media_id[:10],
            ),
        ]

    # Ids without a date prefix use the older two-level layout
    # (<id[0:2]>/<id[2:4]>/<rest>), so both levels are candidates.
    return [
        os.path.join(
            self.base_path, "url_cache",
            media_id[0:2], media_id[2:4],
        ),
        os.path.join(
            self.base_path, "url_cache",
            media_id[0:2],
        ),
    ]
|
||||||
|
|
||||||
def url_cache_thumbnail(self, media_id, width, height, content_type,
                        method):
    """Return the on-disk path of one thumbnail of a url cache file.

    Args:
        media_id (str): the url cache media id.
        width (int): thumbnail width, used in the file name.
        height (int): thumbnail height, used in the file name.
        content_type (str): "<type>/<subtype>" of the thumbnail; it must
            contain exactly one "/" or the unpacking below raises
            ValueError.
        method (str): thumbnailing method, used in the file name.

    Returns:
        str: a path below ``<base_path>/url_cache_thumbnails``.
    """
    import re

    top_level_type, sub_type = content_type.split("/")
    file_name = "%i-%i-%s-%s-%s" % (
        width, height, top_level_type, sub_type, method
    )

    # New-style ids are of the form <DATE><SEPARATOR><RANDOM_STRING>,
    # e.g.: 2017-09-28_fsdRDt24DS234dsf
    # The separator character at index 10 is dropped from the path.
    if re.match(r"\d{4}-\d{2}-\d{2}", media_id):
        return os.path.join(
            self.base_path, "url_cache_thumbnails",
            media_id[:10], media_id[11:],
            file_name
        )

    # Ids without a date prefix keep the older two-level layout so that
    # thumbnails written before the change are still reachable.
    return os.path.join(
        self.base_path, "url_cache_thumbnails",
        media_id[0:2], media_id[2:4], media_id[4:],
        file_name
    )
|
||||||
|
|
||||||
|
def url_cache_thumbnail_directory(self, media_id):
    """Return the directory holding all thumbnails of a url cache file.

    Args:
        media_id (str): the url cache media id.

    Returns:
        str: a directory path below ``<base_path>/url_cache_thumbnails``.
    """
    import re

    # New-style ids are of the form <DATE><SEPARATOR><RANDOM_STRING>,
    # e.g.: 2017-09-28_fsdRDt24DS234dsf
    # The separator character at index 10 is dropped from the path.
    if re.match(r"\d{4}-\d{2}-\d{2}", media_id):
        return os.path.join(
            self.base_path, "url_cache_thumbnails",
            media_id[:10], media_id[11:],
        )

    # Ids without a date prefix keep the older two-level layout.
    return os.path.join(
        self.base_path, "url_cache_thumbnails",
        media_id[0:2], media_id[2:4], media_id[4:],
    )
|
||||||
|
|
||||||
|
def url_cache_thumbnail_dirs_to_delete(self, media_id):
    """The dirs to try and remove if we delete the media_id thumbnails.

    Args:
        media_id (str): the url cache media id whose thumbnails were
            removed.

    Returns:
        list[str]: candidate directories, deepest first, so that a caller
        removing them in order only ever attempts a parent after its child.
    """
    import re

    # New-style ids are of the form <DATE><SEPARATOR><RANDOM_STRING>,
    # e.g.: 2017-09-28_fsdRDt24DS234dsf
    # The separator character at index 10 is dropped from the path.
    if re.match(r"\d{4}-\d{2}-\d{2}", media_id):
        return [
            os.path.join(
                self.base_path, "url_cache_thumbnails",
                media_id[:10], media_id[11:],
            ),
            os.path.join(
                self.base_path, "url_cache_thumbnails",
                media_id[:10],
            ),
        ]

    # Ids without a date prefix keep the older two-level layout, which has
    # one more directory level above the per-media directory.
    return [
        os.path.join(
            self.base_path, "url_cache_thumbnails",
            media_id[0:2], media_id[2:4], media_id[4:],
        ),
        os.path.join(
            self.base_path, "url_cache_thumbnails",
            media_id[0:2], media_id[2:4],
        ),
        os.path.join(
            self.base_path, "url_cache_thumbnails",
            media_id[0:2],
        ),
    ]
|
||||||
|
|
|
@ -36,6 +36,9 @@ import cgi
|
||||||
import ujson as json
|
import ujson as json
|
||||||
import urlparse
|
import urlparse
|
||||||
import itertools
|
import itertools
|
||||||
|
import datetime
|
||||||
|
import errno
|
||||||
|
import shutil
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
@ -70,6 +73,10 @@ class PreviewUrlResource(Resource):
|
||||||
|
|
||||||
self.downloads = {}
|
self.downloads = {}
|
||||||
|
|
||||||
|
self._cleaner_loop = self.clock.looping_call(
|
||||||
|
self._expire_url_cache_data, 30 * 10000
|
||||||
|
)
|
||||||
|
|
||||||
def render_GET(self, request):
|
def render_GET(self, request):
|
||||||
self._async_render_GET(request)
|
self._async_render_GET(request)
|
||||||
return NOT_DONE_YET
|
return NOT_DONE_YET
|
||||||
|
@ -130,7 +137,7 @@ class PreviewUrlResource(Resource):
|
||||||
cache_result = yield self.store.get_url_cache(url, ts)
|
cache_result = yield self.store.get_url_cache(url, ts)
|
||||||
if (
|
if (
|
||||||
cache_result and
|
cache_result and
|
||||||
cache_result["download_ts"] + cache_result["expires"] > ts and
|
cache_result["expires_ts"] > ts and
|
||||||
cache_result["response_code"] / 100 == 2
|
cache_result["response_code"] / 100 == 2
|
||||||
):
|
):
|
||||||
respond_with_json_bytes(
|
respond_with_json_bytes(
|
||||||
|
@ -239,7 +246,7 @@ class PreviewUrlResource(Resource):
|
||||||
url,
|
url,
|
||||||
media_info["response_code"],
|
media_info["response_code"],
|
||||||
media_info["etag"],
|
media_info["etag"],
|
||||||
media_info["expires"],
|
media_info["expires"] + media_info["created_ts"],
|
||||||
json.dumps(og),
|
json.dumps(og),
|
||||||
media_info["filesystem_id"],
|
media_info["filesystem_id"],
|
||||||
media_info["created_ts"],
|
media_info["created_ts"],
|
||||||
|
@ -253,8 +260,7 @@ class PreviewUrlResource(Resource):
|
||||||
# we're most likely being explicitly triggered by a human rather than a
|
# we're most likely being explicitly triggered by a human rather than a
|
||||||
# bot, so are we really a robot?
|
# bot, so are we really a robot?
|
||||||
|
|
||||||
# XXX: horrible duplication with base_resource's _download_remote_file()
|
file_id = datetime.date.today().isoformat() + '_' + random_string(16)
|
||||||
file_id = random_string(24)
|
|
||||||
|
|
||||||
fname = self.filepaths.url_cache_filepath(file_id)
|
fname = self.filepaths.url_cache_filepath(file_id)
|
||||||
self.media_repo._makedirs(fname)
|
self.media_repo._makedirs(fname)
|
||||||
|
@ -328,6 +334,86 @@ class PreviewUrlResource(Resource):
|
||||||
"etag": headers["ETag"][0] if "ETag" in headers else None,
|
"etag": headers["ETag"][0] if "ETag" in headers else None,
|
||||||
})
|
})
|
||||||
|
|
||||||
|
@defer.inlineCallbacks
def _expire_url_cache_data(self):
    """Clean up expired url cache content, media and thumbnails.

    Runs two passes:

    1. For each expired url cache entry, delete the downloaded file and
       then the database row.
    2. For each url cache media item created more than two days ago, delete
       the file, its thumbnail directory and the database rows.

    A file that is already missing counts as removed. Any other failure to
    remove a file is logged and that media id is left in the database.

    Returns:
        Deferred: fires once both passes have finished.
    """

    def _prune_dirs(dirs):
        # Best effort only: os.rmdir refuses to remove a directory that is
        # not empty, which is the normal case while other media still lives
        # there. Stop at the first failure, since the remaining entries are
        # parents of the one that could not be removed.
        try:
            for dirpath in dirs:
                os.rmdir(dirpath)
        except OSError:
            pass

    now = self.clock.time_msec()

    # First we delete expired url cache entries
    media_ids = yield self.store.get_expired_url_cache(now)

    removed_media = []
    for media_id in media_ids:
        fname = self.filepaths.url_cache_filepath(media_id)
        try:
            os.remove(fname)
        except OSError as e:
            # If the path doesn't exist, meh
            if e.errno != errno.ENOENT:
                logger.warn("Failed to remove media: %r: %s", media_id, e)
                continue

        removed_media.append(media_id)

        _prune_dirs(self.filepaths.url_cache_filepath_dirs_to_delete(media_id))

    yield self.store.delete_url_cache(removed_media)

    logger.info("Deleted %d entries from url cache", len(removed_media))

    # Now we delete old images associated with the url cache.
    # These may be cached for a bit on the client (i.e., they
    # may have a room open with a preview url thing open).
    # So we wait a couple of days before deleting, just in case.
    expire_before = now - 2 * 24 * 60 * 60 * 1000
    # The result must be bound here: without it the loop below would walk
    # the expired url cache ids from the first pass again.
    media_ids = yield self.store.get_url_cache_media_before(expire_before)

    removed_media = []
    for media_id in media_ids:
        fname = self.filepaths.url_cache_filepath(media_id)
        try:
            os.remove(fname)
        except OSError as e:
            # If the path doesn't exist, meh
            if e.errno != errno.ENOENT:
                logger.warn("Failed to remove media: %r: %s", media_id, e)
                continue

        _prune_dirs(self.filepaths.url_cache_filepath_dirs_to_delete(media_id))

        thumbnail_dir = self.filepaths.url_cache_thumbnail_directory(media_id)
        try:
            shutil.rmtree(thumbnail_dir)
        except OSError as e:
            # If the path doesn't exist, meh
            if e.errno != errno.ENOENT:
                logger.warn("Failed to remove media: %r: %s", media_id, e)
                continue

        removed_media.append(media_id)

        _prune_dirs(self.filepaths.url_cache_thumbnail_dirs_to_delete(media_id))

    yield self.store.delete_url_cache_media(removed_media)

    logger.info("Deleted %d media from url cache", len(removed_media))
|
||||||
|
|
||||||
|
|
||||||
def decode_and_calc_og(body, media_uri, request_encoding=None):
|
def decode_and_calc_og(body, media_uri, request_encoding=None):
|
||||||
from lxml import etree
|
from lxml import etree
|
||||||
|
|
|
@ -62,7 +62,7 @@ class MediaRepositoryStore(SQLBaseStore):
|
||||||
def get_url_cache_txn(txn):
|
def get_url_cache_txn(txn):
|
||||||
# get the most recently cached result (relative to the given ts)
|
# get the most recently cached result (relative to the given ts)
|
||||||
sql = (
|
sql = (
|
||||||
"SELECT response_code, etag, expires, og, media_id, download_ts"
|
"SELECT response_code, etag, expires_ts, og, media_id, download_ts"
|
||||||
" FROM local_media_repository_url_cache"
|
" FROM local_media_repository_url_cache"
|
||||||
" WHERE url = ? AND download_ts <= ?"
|
" WHERE url = ? AND download_ts <= ?"
|
||||||
" ORDER BY download_ts DESC LIMIT 1"
|
" ORDER BY download_ts DESC LIMIT 1"
|
||||||
|
@ -74,7 +74,7 @@ class MediaRepositoryStore(SQLBaseStore):
|
||||||
# ...or if we've requested a timestamp older than the oldest
|
# ...or if we've requested a timestamp older than the oldest
|
||||||
# copy in the cache, return the oldest copy (if any)
|
# copy in the cache, return the oldest copy (if any)
|
||||||
sql = (
|
sql = (
|
||||||
"SELECT response_code, etag, expires, og, media_id, download_ts"
|
"SELECT response_code, etag, expires_ts, og, media_id, download_ts"
|
||||||
" FROM local_media_repository_url_cache"
|
" FROM local_media_repository_url_cache"
|
||||||
" WHERE url = ? AND download_ts > ?"
|
" WHERE url = ? AND download_ts > ?"
|
||||||
" ORDER BY download_ts ASC LIMIT 1"
|
" ORDER BY download_ts ASC LIMIT 1"
|
||||||
|
@ -86,14 +86,14 @@ class MediaRepositoryStore(SQLBaseStore):
|
||||||
return None
|
return None
|
||||||
|
|
||||||
return dict(zip((
|
return dict(zip((
|
||||||
'response_code', 'etag', 'expires', 'og', 'media_id', 'download_ts'
|
'response_code', 'etag', 'expires_ts', 'og', 'media_id', 'download_ts'
|
||||||
), row))
|
), row))
|
||||||
|
|
||||||
return self.runInteraction(
|
return self.runInteraction(
|
||||||
"get_url_cache", get_url_cache_txn
|
"get_url_cache", get_url_cache_txn
|
||||||
)
|
)
|
||||||
|
|
||||||
def store_url_cache(self, url, response_code, etag, expires, og, media_id,
|
def store_url_cache(self, url, response_code, etag, expires_ts, og, media_id,
|
||||||
download_ts):
|
download_ts):
|
||||||
return self._simple_insert(
|
return self._simple_insert(
|
||||||
"local_media_repository_url_cache",
|
"local_media_repository_url_cache",
|
||||||
|
@ -101,7 +101,7 @@ class MediaRepositoryStore(SQLBaseStore):
|
||||||
"url": url,
|
"url": url,
|
||||||
"response_code": response_code,
|
"response_code": response_code,
|
||||||
"etag": etag,
|
"etag": etag,
|
||||||
"expires": expires,
|
"expires_ts": expires_ts,
|
||||||
"og": og,
|
"og": og,
|
||||||
"media_id": media_id,
|
"media_id": media_id,
|
||||||
"download_ts": download_ts,
|
"download_ts": download_ts,
|
||||||
|
@ -238,3 +238,64 @@ class MediaRepositoryStore(SQLBaseStore):
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
return self.runInteraction("delete_remote_media", delete_remote_media_txn)
|
return self.runInteraction("delete_remote_media", delete_remote_media_txn)
|
||||||
|
|
||||||
|
def get_expired_url_cache(self, now_ts):
    """Fetch media ids of url cache entries that expired before `now_ts`.

    At most 100 ids are returned, those with the smallest `expires_ts`
    first.

    Args:
        now_ts: value compared against the `expires_ts` column.

    Returns:
        Whatever `runInteraction` returns for a transaction function that
        yields the list of media ids.
    """
    def _fetch_expired_ids_txn(txn):
        txn.execute(
            "SELECT media_id FROM local_media_repository_url_cache"
            " WHERE expires_ts < ?"
            " ORDER BY expires_ts ASC"
            " LIMIT 100",
            (now_ts,)
        )
        expired_ids = []
        for row in txn:
            expired_ids.append(row[0])
        return expired_ids

    return self.runInteraction(
        "get_expired_url_cache", _fetch_expired_ids_txn
    )
|
||||||
|
|
||||||
|
def delete_url_cache(self, media_ids):
    """Delete the url cache rows for the given media ids.

    Args:
        media_ids (iterable[str]): media ids whose rows in
            `local_media_repository_url_cache` should be removed.

    Returns:
        Whatever `runInteraction` returns for the delete transaction.
    """
    sql = (
        "DELETE FROM local_media_repository_url_cache"
        " WHERE media_id = ?"
    )

    def _delete_url_cache_txn(txn):
        # Each parameter row must be a sequence. `(media_id)` is just the
        # string itself, so the trailing comma is required to build the
        # 1-tuple bound to the single placeholder.
        txn.executemany(sql, [(media_id,) for media_id in media_ids])

    return self.runInteraction("delete_url_cache", _delete_url_cache_txn)
|
||||||
|
|
||||||
|
def get_url_cache_media_before(self, before_ts):
    """Fetch media ids of url cache media created before `before_ts`.

    Only rows whose `url_cache` column is set are considered.
    `local_media_repository` is not restricted to url cache media, and the
    caller deletes whatever is returned here, so without that filter every
    old local media item would be selected for deletion. At most 100 ids
    are returned, oldest first.

    Args:
        before_ts: value compared against the `created_ts` column.

    Returns:
        Whatever `runInteraction` returns for a transaction function that
        yields the list of media ids.
    """
    sql = (
        "SELECT media_id FROM local_media_repository"
        " WHERE created_ts < ? AND url_cache IS NOT NULL"
        " ORDER BY created_ts ASC"
        " LIMIT 100"
    )

    def _get_url_cache_media_before_txn(txn):
        txn.execute(sql, (before_ts,))
        return [row[0] for row in txn]

    return self.runInteraction(
        "get_url_cache_media_before", _get_url_cache_media_before_txn,
    )
|
||||||
|
|
||||||
|
def delete_url_cache_media(self, media_ids):
    """Delete the media and thumbnail rows for the given media ids.

    Both deletes run inside the same `runInteraction` call.

    Args:
        media_ids (iterable[str]): media ids to remove from
            `local_media_repository` and
            `local_media_repository_thumbnails`.

    Returns:
        Whatever `runInteraction` returns for the delete transaction.
    """
    def _delete_url_cache_media_txn(txn):
        # Each parameter row must be a sequence. `(media_id)` is just the
        # string itself, so build real 1-tuples. Materialise them once so a
        # one-shot iterable is not exhausted by the first statement.
        rows = [(media_id,) for media_id in media_ids]

        sql = (
            "DELETE FROM local_media_repository"
            " WHERE media_id = ?"
        )

        txn.executemany(sql, rows)

        sql = (
            "DELETE FROM local_media_repository_thumbnails"
            " WHERE media_id = ?"
        )

        txn.executemany(sql, rows)

    return self.runInteraction(
        "delete_url_cache_media", _delete_url_cache_media_txn,
    )
|
||||||
|
|
|
@ -25,7 +25,7 @@ logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# Remember to update this number every time a change is made to database
|
# Remember to update this number every time a change is made to database
|
||||||
# schema files, so the users will be informed on server restarts.
|
# schema files, so the users will be informed on server restarts.
|
||||||
SCHEMA_VERSION = 43
|
SCHEMA_VERSION = 44
|
||||||
|
|
||||||
dir_path = os.path.abspath(os.path.dirname(__file__))
|
dir_path = os.path.abspath(os.path.dirname(__file__))
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,36 @@
|
||||||
|
/* Copyright 2017 New Vector Ltd
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
-- Partial index on creation time, restricted to rows of
-- local_media_repository whose url_cache column is set.
CREATE INDEX local_media_repository_url_idx ON local_media_repository(created_ts) WHERE url_cache IS NOT NULL;

-- we need to change `expires` to `expires_ts` so that we can index on it. SQLite doesn't support
-- indices on expressions until 3.9.
-- The table is rebuilt (create new, copy rows, drop old, rename) rather than
-- altered in place.
CREATE TABLE local_media_repository_url_cache_new(
    url TEXT,
    response_code INTEGER,
    etag TEXT,
    expires_ts BIGINT,
    og TEXT,
    media_id TEXT,
    download_ts BIGINT
);

-- Copy every existing row across. The INSERT has no column list, so the
-- SELECT columns must stay in the same order as the table definition above.
-- The new expires_ts column receives `expires + download_ts`.
INSERT INTO local_media_repository_url_cache_new
    SELECT url, response_code, etag, expires + download_ts, og, media_id, download_ts FROM local_media_repository_url_cache;

-- Swap the rebuilt table into place under the original name.
DROP TABLE local_media_repository_url_cache;
ALTER TABLE local_media_repository_url_cache_new RENAME TO local_media_repository_url_cache;

-- Index on the new absolute expiry column.
CREATE INDEX local_media_repository_url_cache_expires_idx ON local_media_repository_url_cache(expires_ts);
|
Loading…
Reference in New Issue