From 3595af7bd239f3843aff3ae06df8932cff23173d Mon Sep 17 00:00:00 2001 From: Andrew Godwin Date: Sat, 10 Dec 2022 12:16:08 -0700 Subject: Media proxy, caching and tuning docs Fixes #67 --- .gitignore | 1 + activities/models/post_attachment.py | 4 +- activities/views/posts.py | 2 + docs/installation.rst | 9 +-- docs/tuning.rst | 146 ++++++++++++++++++++++++++++++++--- mediaproxy/__init__.py | 0 mediaproxy/apps.py | 6 ++ mediaproxy/views.py | 101 ++++++++++++++++++++++++ takahe/settings.py | 13 +++- takahe/urls.py | 17 ++++ users/models/identity.py | 4 +- users/views/identity.py | 2 + 12 files changed, 285 insertions(+), 20 deletions(-) create mode 100644 mediaproxy/__init__.py create mode 100644 mediaproxy/apps.py create mode 100644 mediaproxy/views.py diff --git a/.gitignore b/.gitignore index 8308fed..bc256e2 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,7 @@ .vscode /*.env /build +/cache/ /docs/_build /media/ /static-collected diff --git a/activities/models/post_attachment.py b/activities/models/post_attachment.py index 7feaba5..932ae65 100644 --- a/activities/models/post_attachment.py +++ b/activities/models/post_attachment.py @@ -77,13 +77,13 @@ class PostAttachment(StatorModel): elif self.file: return self.file.url else: - return self.remote_url + return f"/proxy/post_attachment/{self.pk}/" def full_url(self): if self.file: return self.file.url else: - return self.remote_url + return f"/proxy/post_attachment/{self.pk}/" ### ActivityPub ### diff --git a/activities/views/posts.py b/activities/views/posts.py index d343567..ccc38fc 100644 --- a/activities/views/posts.py +++ b/activities/views/posts.py @@ -3,6 +3,7 @@ from django.db import models from django.http import JsonResponse from django.shortcuts import get_object_or_404, redirect, render from django.utils.decorators import method_decorator +from django.views.decorators.vary import vary_on_headers from django.views.generic import TemplateView, View from activities.models import Post, PostInteraction, 
PostInteractionStates, PostStates @@ -15,6 +16,7 @@ from users.shortcuts import by_handle_or_404 @method_decorator( cache_page_by_ap_json("cache_timeout_page_post", public_only=True), name="dispatch" ) +@method_decorator(vary_on_headers("Accept"), name="dispatch") class Individual(TemplateView): template_name = "activities/post.html" diff --git a/docs/installation.rst b/docs/installation.rst index b268377..f8b8937 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -252,9 +252,8 @@ You should select the "Domains" link in the sidebar and create one, and then you will be able to make your first identity. -Scaling -------- +Tuning and Scaling +------------------ -You can run as many copies of the webserver and workers as you like; the main -limitation will be your database server's processing power and number of -allowed connections. +See :doc:`/tuning` for all the things you should tweak as your server gains +users. We recommend setting up caches early on! diff --git a/docs/tuning.rst b/docs/tuning.rst index 9e959ec..4fdc43b 100644 --- a/docs/tuning.rst +++ b/docs/tuning.rst @@ -5,6 +5,39 @@ This page contains a collection of tips and settings that can be used to tune your server based upon its users and the other servers it federates with. +Scaling +------- + +The only bottleneck, and single point of failure in a Takahē installation is +its database; no permanent state is stored elsewhere. + +Provided your database is happy (and PostgreSQL does a very good job of just +using more resources if you give them to it), you can: + +* Run more webserver containers to handle a higher request load (requests + come from both users and other ActivityPub servers trying to forward you + messages). Consider setting up the DEFAULT cache under high request load, too. + +* Run more Stator worker containers to handle a higher processing load (Stator + handles pulling profiles, fanning out messages to followers, and processing + stats, among others). 
You'll generally see Stator load climb roughly in + relation to the sum of the number of followers each user in your instance has; + a "celebrity" or other popular account will give Stator a lot of work as it + has to send a copy of each of their posts to every follower, separately. + +As you scale up the number of containers, keep the PostgreSQL connection limit +in mind; this is generally the first thing that will fail, as Stator workers in +particular are quite connection-hungry (the parallel nature of their internal +processing means they might be working on 50 different objects at once). It's +generally a good idea to set it as high as your PostgreSQL server will take +(consult PostgreSQL tuning guides for the effect changing that setting has +on memory usage, specifically). + +If you end up having a large server that is running into database performance +problems, please get in touch with us and discuss it; Takahē is young enough +that we need data and insight from those installations to help optimise it more. + + Federating ---------- @@ -17,22 +50,115 @@ Environment Variable: Caching --------- +------- By default Takakē has caching disabled. The caching needs of a server can varying drastically based upon the number of users and how interconnected they are with other servers. -Caching is configured by specifying a cache DSN in the environment variable -``TAKAHE_CACHES_DEFAULT``. The DSN format can be any supported by +There are multiple ways Takahē uses caches: + +* For caching rendered pages and responses, like user profile information. + These caches reduce database load on your server and improve performance. + +* For proxying and caching remote user images and post images. These must be + proxied to protect your users' privacy; also caching these reduces + your server's consumed bandwidth and improves users' loading times.
+ +The exact caches you can configure are: + +* ``TAKAHE_CACHES_DEFAULT``: Rendered page and response caching + +* ``TAKAHE_CACHES_MEDIA``: Remote post images and user profile header pictures + +* ``TAKAHE_CACHES_AVATARS``: Remote user avatars ("icons") only + +We recommend you set up ``TAKAHE_CACHES_MEDIA`` and ``TAKAHE_CACHES_AVATARS`` +at a bare minimum - proxying these all the time without caching will eat into +your server's bandwidth. + +All caches are configured the same way - with a custom cache URI/URL. We +support anything that is available as part of `django-cache-url <https://github.com/epicserve/django-cache-url>`_, but some cache backends will require additional Python packages not installed -by default with Takahē. +by default with Takahē. More discussion of the backends is below. + +All items in the cache come with an expiry set - usually one week - but you +can also configure a maximum cache size on dedicated cache datastores like +Memcache. The key names used by the caches do not overlap, so there is +no need to configure different key prefixes for each of Takahē's caches. + + +Backends +~~~~~~~~ + +Redis +##### + +Examples:: + redis://redis:6379/0 + redis://user:password@redis:6379/0 + rediss://user:password@redis:6379/0 + +A Redis-protocol server. Use ``redis://`` for unencrypted communication and +``rediss://`` for TLS. + +Redis has a large item size limit and is suitable for all caches. We recommend +that you keep the DEFAULT cache separate from the MEDIA and AVATARS caches, and +set the ``maxmemory`` on both to appropriate values (the proxying caches will +need more memory than the DEFAULT cache). + + + +Memcache +######## + +Examples:: + memcached://memcache:11211?key_prefix=takahe + memcached://server1:11211,server2:11211 + +A remote Memcache-protocol server (or set of servers). + +Memcached has a 1MB limit per key by default, so this is only suitable for the +DEFAULT cache and not the AVATARS or MEDIA cache.
+ + +Filesystem +########## + +Examples:: + file:///var/cache/takahe/ + +A cache on the local disk. + +This *will* work with any of the cache backends, but is probably more suitable +for MEDIA and AVATARS. + +Note that if you are running Takahē in a cluster, this cache will not be shared +across different machines. This is not quite as bad as it first seems; it just +means you will have more potential uncached requests until all machines have +a cached copy. + + +Local Memory +############ + +Examples:: + locmem://default + +A local memory cache, inside the Python process. This will consume additional +memory for the process, and should not be used with the MEDIA or AVATARS caches. + + +CDNs +---- + +You can use Takahē with a "read through" CDN that takes over your site's main +domain serving and passes some requests through to Takahē as a backend. -**Examples** +Takahē sets the appropriate ``Vary`` headers to ensure that cache leakage does +not happen, and ``Last-Modified`` and ``ETag`` headers to allow the CDN to +correctly expire cache items. -* LocMem cache for a small server: ``locmem://default`` -* Memcache cache for a service named ``memcache`` in a docker compose file: - ``memcached://memcache:11211?key_prefix=takahe`` -* Multiple memcache cache servers: - ``memcached://server1:11211,server2:11211`` +Takahē does not yet support offloading local media URLs (such as profile images +and post images) to a *separate* CDN URL; this will be coming in the future. 
diff --git a/mediaproxy/__init__.py b/mediaproxy/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/mediaproxy/apps.py b/mediaproxy/apps.py
new file mode 100644
index 0000000..6b87719
--- /dev/null
+++ b/mediaproxy/apps.py
@@ -0,0 +1,6 @@
+from django.apps import AppConfig
+
+
+class MediaproxyConfig(AppConfig):
+    default_auto_field = "django.db.models.BigAutoField"
+    name = "mediaproxy"
diff --git a/mediaproxy/views.py b/mediaproxy/views.py
new file mode 100644
index 0000000..e799e8b
--- /dev/null
+++ b/mediaproxy/views.py
@@ -0,0 +1,134 @@
+import httpx
+from django.conf import settings
+from django.core.cache import caches
+from django.http import Http404, HttpResponse
+from django.shortcuts import get_object_or_404
+from django.views.generic import View
+
+from activities.models import PostAttachment
+from users.models import Identity
+
+
+class BaseCacheView(View):
+    """
+    Base class for proxying and caching remote content.
+
+    Subclasses implement get_remote_url() and may override cache_name and
+    item_timeout.
+    """
+
+    # Alias of the Django cache (a key of settings.CACHES) that stores content
+    cache_name = "media"
+    # Timeout, in seconds, handed to cache.set() for each stored item
+    item_timeout: int | None = None
+
+    def get(self, request, **kwargs):
+        """
+        Serves the remote file, from the cache if present and otherwise by
+        fetching it. Responds with 404 when there is nothing to proxy and 502
+        when the remote fetch fails or returns an error status.
+        """
+        self.kwargs = kwargs
+        remote_url = self.get_remote_url()
+        # Guard against an object with no remote URL; building the cache key
+        # from None would raise a TypeError and surface as a 500
+        if not remote_url:
+            raise Http404()
+        cache = caches[self.cache_name]
+        cache_key = "proxy_" + remote_url
+        # See if it's already cached
+        cached_content = cache.get(cache_key)
+        if not cached_content:
+            # OK, fetch and cache it
+            try:
+                remote_response = httpx.get(
+                    remote_url,
+                    headers={"User-Agent": settings.TAKAHE_USER_AGENT},
+                    follow_redirects=True,
+                    timeout=settings.SETUP.REMOTE_TIMEOUT,
+                )
+            except httpx.RequestError:
+                # RequestError is the base class of ConnectError, so it alone
+                # covers connection failures
+                return HttpResponse(status=502)
+            if remote_response.status_code >= 400:
+                return HttpResponse(status=502)
+            # We got it - shove it into the cache
+            cached_content = {
+                "content": remote_response.content,
+                "mimetype": remote_response.headers.get(
+                    "Content-Type", "application/octet-stream"
+                ),
+            }
+            cache.set(cache_key, cached_content, timeout=self.item_timeout)
+        return HttpResponse(
+            cached_content["content"],
+            headers={
+                "Content-Type": cached_content["mimetype"],
+            },
+        )
+
+    def get_remote_url(self):
+        """
+        Returns the remote URL to proxy, or raises Http404 if the object
+        should not be proxied. Must be implemented by subclasses.
+        """
+        raise NotImplementedError()
+
+
+class IdentityIconCacheView(BaseCacheView):
+    """
+    Caches identity icons (avatars)
+    """
+
+    cache_name = "avatars"
+    item_timeout = 86400 * 7  # One week
+
+    def get_remote_url(self):
+        """
+        Returns the icon URI of a remote identity; 404s for local identities
+        and for identities without an icon URI.
+        """
+        self.identity = get_object_or_404(Identity, pk=self.kwargs["identity_id"])
+        # Check icon_uri, the field returned below, not the header image_uri
+        if self.identity.local or not self.identity.icon_uri:
+            raise Http404()
+        return self.identity.icon_uri
+
+
+class IdentityImageCacheView(BaseCacheView):
+    """
+    Caches identity profile header images
+    """
+
+    item_timeout = 86400 * 7  # One week
+
+    def get_remote_url(self):
+        """
+        Returns the header image URI of a remote identity; 404s for local
+        identities and for identities without an image URI.
+        """
+        self.identity = get_object_or_404(Identity, pk=self.kwargs["identity_id"])
+        if self.identity.local or not self.identity.image_uri:
+            raise Http404()
+        return self.identity.image_uri
+
+
+class PostAttachmentCacheView(BaseCacheView):
+    """
+    Caches post media (images only, videos should always be offloaded to remote)
+    """
+
+    item_timeout = 86400 * 7  # One week
+
+    def get_remote_url(self):
+        """
+        Returns the remote URL of an image attachment; 404s for attachments
+        that are not images.
+        """
+        self.post_attachment = get_object_or_404(
+            PostAttachment, pk=self.kwargs["attachment_id"]
+        )
+        if not self.post_attachment.is_image():
+            raise Http404()
+        return self.post_attachment.remote_url
diff --git a/takahe/settings.py b/takahe/settings.py
index f508952..64a523a 100644
--- a/takahe/settings.py
+++ b/takahe/settings.py
@@ -118,6 +118,12 @@ class Settings(BaseSettings):
     #: Default cache backend
     CACHES_DEFAULT: CacheBackendUrl | None = None
 
+    #: User icon (avatar) caching backend
+    CACHES_AVATARS: CacheBackendUrl | None = None
+
+    #: Media caching backend
+    CACHES_MEDIA: CacheBackendUrl | None = None
+
     PGHOST: str | None = None
     PGPORT: int | None = 5432
     PGNAME: str = "takahe"
@@ -167,6 +173,7 @@ INSTALLED_APPS = [
     "activities",
     "users",
     "stator",
+    "mediaproxy",
 ]
 
 MIDDLEWARE = [
@@ -351,7 +358,11 @@ if SETUP.MEDIA_BACKEND:
     else:
         raise ValueError(f"Unsupported media backend {parsed.scheme}")
 
-CACHES = {"default":
django_cache_url.parse(SETUP.CACHES_DEFAULT or "dummy://")} +CACHES = { + "default": django_cache_url.parse(SETUP.CACHES_DEFAULT or "dummy://"), + "avatars": django_cache_url.parse(SETUP.CACHES_AVATARS or "dummy://"), + "media": django_cache_url.parse(SETUP.CACHES_MEDIA or "dummy://"), +} if SETUP.ERROR_EMAILS: ADMINS = [("Admin", e) for e in SETUP.ERROR_EMAILS] diff --git a/takahe/urls.py b/takahe/urls.py index 66f176d..98d1cd5 100644 --- a/takahe/urls.py +++ b/takahe/urls.py @@ -5,6 +5,7 @@ from django.views.static import serve from activities.views import compose, explore, follows, posts, search, timelines from core import views as core +from mediaproxy import views as mediaproxy from stator import views as stator from users.views import activitypub, admin, auth, identity, settings @@ -176,6 +177,22 @@ urlpatterns = [ core.FlatPage.as_view(title="Server Rules", config_option="policy_rules"), name="rules", ), + # Media/image proxy + path( + "proxy/identity_icon//", + mediaproxy.IdentityIconCacheView.as_view(), + name="proxy_identity_icon", + ), + path( + "proxy/identity_image//", + mediaproxy.IdentityImageCacheView.as_view(), + name="proxy_identity_image", + ), + path( + "proxy/post_attachment//", + mediaproxy.PostAttachmentCacheView.as_view(), + name="proxy_post_attachment", + ), # Well-known endpoints and system actor path(".well-known/webfinger", activitypub.Webfinger.as_view()), path(".well-known/host-meta", activitypub.HostMeta.as_view()), diff --git a/users/models/identity.py b/users/models/identity.py index c674bf4..21ac0fd 100644 --- a/users/models/identity.py +++ b/users/models/identity.py @@ -153,7 +153,7 @@ class Identity(StatorModel): if self.icon: return self.icon.url elif self.icon_uri: - return self.icon_uri + return f"/proxy/identity_icon/{self.pk}/" else: return static("img/unknown-icon-128.png") @@ -164,7 +164,7 @@ class Identity(StatorModel): if self.image: return self.image.url elif self.image_uri: - return self.image_uri + return 
f"/proxy/identity_image/{self.pk}/" @property def safe_summary(self): diff --git a/users/views/identity.py b/users/views/identity.py index 4ca230a..fba640c 100644 --- a/users/views/identity.py +++ b/users/views/identity.py @@ -7,6 +7,7 @@ from django.core import validators from django.http import Http404, JsonResponse from django.shortcuts import redirect from django.utils.decorators import method_decorator +from django.views.decorators.vary import vary_on_headers from django.views.generic import FormView, ListView, TemplateView, View from activities.models import Post, PostInteraction @@ -18,6 +19,7 @@ from users.models import Domain, Follow, FollowStates, Identity, IdentityStates from users.shortcuts import by_handle_or_404 +@method_decorator(vary_on_headers("Accept"), name="dispatch") @method_decorator(cache_page_by_ap_json(public_only=True), name="dispatch") class ViewIdentity(ListView): """ -- cgit v1.2.3