diff --git a/mfr/core/exceptions.py b/mfr/core/exceptions.py index 8f3573b54..fb0c86835 100644 --- a/mfr/core/exceptions.py +++ b/mfr/core/exceptions.py @@ -145,6 +145,27 @@ def __init__(self, message, *args, metadata_url: str='', response: str='', **kwa 'response': self.response }]) + +class QueryParameterError(ProviderError): + """The MFR related errors raised from a :class:`mfr.core.provider`and relating to query + parameters. This error is thrown when the query has an invalid value. + """ + + __TYPE = 'query_parameter' + + def __init__(self, message, *args, url: str='', code: int=400, **kwargs): + super().__init__(message, code=code, *args, **kwargs) + self.url = url + self.return_code = code + self.attr_stack.append(( + self.__TYPE, + { + 'url': self.url, + 'returncode': self.return_code, + } + )) + + class TooBigToRenderError(ProviderError): """If the user tries to render a file larger than a server specified maximum, throw a TooBigToRenderError. diff --git a/mfr/core/provider.py b/mfr/core/provider.py index dac7c0f62..ca1eb4227 100644 --- a/mfr/core/provider.py +++ b/mfr/core/provider.py @@ -1,14 +1,19 @@ -import abc -import markupsafe +from abc import ( + ABCMeta, + abstractmethod, + abstractproperty +) -import furl +from aiohttp import HttpBadRequest +from furl import furl +import markupsafe -from mfr.core import exceptions -from mfr.server import settings +from mfr.core.exceptions import ProviderError from mfr.core.metrics import MetricsRecord +from mfr.server.settings import ALLOWED_PROVIDER_NETLOCS -class BaseProvider(metaclass=abc.ABCMeta): +class BaseProvider(metaclass=ABCMeta): """Base class for MFR Providers. Requires ``download`` and ``metadata`` methods. Validates that the given file url is hosted at a domain listed in `mfr.server.settings.ALLOWED_PROVIDER_DOMAINS`. @@ -16,13 +21,14 @@ class BaseProvider(metaclass=abc.ABCMeta): def __init__(self, request, url, action=None): self.request = request - url_netloc = furl.furl(url).netloc - if url_netloc not in settings.ALLOWED_PROVIDER_NETLOCS: - raise exceptions.ProviderError( + netloc = furl(url).netloc + if netloc not in ALLOWED_PROVIDER_NETLOCS: + raise ProviderError( message="{} is not a permitted provider domain.".format( - markupsafe.escape(url_netloc) + markupsafe.escape(netloc) ), - code=400 + # TODO: using HTTPStatus.BAD_REQUEST fails tests, not sure why and I will take a another look later + code=HttpBadRequest.code ) self.url = url self.action = action @@ -34,28 +40,30 @@ def __init__(self, request, url, action=None): 'url': str(self.url), }) - @abc.abstractproperty + @abstractproperty def NAME(self): raise NotImplementedError - @abc.abstractmethod + @abstractmethod def metadata(self): pass - @abc.abstractmethod + @abstractmethod def download(self): pass class ProviderMetadata: - def __init__(self, name, ext, content_type, unique_key, download_url, stable_id=None): + def __init__(self, name, ext, content_type, unique_key, + download_url, is_public=False, stable_id=None): self.name = name self.ext = ext self.content_type = content_type self.unique_key = unique_key self.download_url = download_url self.stable_id = stable_id + self.is_public = is_public def serialize(self): return { @@ -65,4 +73,5 @@ def serialize(self): 'unique_key': str(self.unique_key), 'download_url': str(self.download_url), 'stable_id': None if self.stable_id is None else str(self.stable_id), + 'is_public': self.is_public, } diff --git a/mfr/core/utils.py b/mfr/core/utils.py index 52ae1c2b8..30190cc13 100644 --- a/mfr/core/utils.py +++ b/mfr/core/utils.py @@ -78,14 +78,29 @@ def make_renderer(name, metadata, file_path, url, assets_url, export_url): :rtype: :class:`mfr.core.extension.BaseRenderer` """ normalized_name = (name and name.lower()) or 'none' + if metadata.is_public: + try: + # Use the public renderer if exist + return driver.DriverManager( + namespace='mfr.public_renderers', + name=normalized_name, + invoke_on_load=True, + invoke_args=(metadata, file_path, url, assets_url, export_url), + ).driver + except: + # If public render does not exist, use default renderer by MFR + # If public render exists but exceptions occurs, delay the exception handling + pass + try: + # Use the default MFR handler return driver.DriverManager( namespace='mfr.renderers', name=normalized_name, invoke_on_load=True, invoke_args=(metadata, file_path, url, assets_url, export_url), ).driver - except RuntimeError: + except: raise exceptions.MakeRendererError( namespace='mfr.renderers', name=normalized_name, diff --git a/mfr/extensions/office365/README.md b/mfr/extensions/office365/README.md new file mode 100644 index 000000000..a43ca9cc4 --- /dev/null +++ b/mfr/extensions/office365/README.md @@ -0,0 +1,20 @@ + +# Office 365 Renderer + + +This renderer uses Office Online to render .docx files for us. If the Office Online URL ever changes, it will also need to be changed here in settings. + +Currently there is no OSF side component for these changes. Once there is, this specific note can be removed. In the meantime in order to test this renderer, you need to go to your local OSF copy of this file: https://github.com/CenterForOpenScience/osf.io/blob/develop/addons/base/views.py#L728-L736 +and add 'public_file' : 1, to the dict. This will send all files as public files. + +Testing this renderer locally is hard. Since Office Online needs access to the files it will not work with private files or ones hosted locally. To see what the docx files will render like, replace the render function with something that looks like this: + +``` + def render(self): + static_url = 'https://files.osf.io/v1/resources//providers/osfstorage/' + url = settings.OFFICE_BASE_URL + download_url.url + return self.TEMPLATE.render(base=self.assets_url, url=url) + +``` + +The file at `static_url` must be publicly available. diff --git a/mfr/extensions/office365/__init__.py b/mfr/extensions/office365/__init__.py new file mode 100644 index 000000000..08833dba1 --- /dev/null +++ b/mfr/extensions/office365/__init__.py @@ -0,0 +1 @@ +from .render import Office365Renderer # noqa diff --git a/mfr/extensions/office365/render.py b/mfr/extensions/office365/render.py new file mode 100644 index 000000000..7b2457369 --- /dev/null +++ b/mfr/extensions/office365/render.py @@ -0,0 +1,44 @@ +import os +from urllib import parse + +from furl import furl +from mako.lookup import TemplateLookup + +from mfr.core.extension import BaseRenderer +from mfr.extensions.office365.settings import OFFICE_BASE_URL + + +class Office365Renderer(BaseRenderer): + """A renderer for .docx files that are publicly available. + + Office online can render `.docx` files to `.pdf` for us. This renderer will only be made + if a query param with `public_file=true` is present. It then generates and embeds an + office online URL into an `iframe` and returns the template. The file it is trying to + render MUST be public. + + Note: The url for the file to convert must be available publicly on the + internet in order for the renderer to access it. This means files stored on + OSF storage locally will not render unless the local server is listening on + external connections and waterbutler is providing urls that are externally + accessible. + """ + + TEMPLATE = TemplateLookup( + directories=[ + os.path.join(os.path.dirname(__file__), 'templates') + ]).get_template('viewer.mako') + + def render(self): + download_url = furl(self.metadata.download_url).set(query='').url + return self.TEMPLATE.render( + base=self.assets_url, + url=OFFICE_BASE_URL + parse.quote(download_url) + ) + + @property + def file_required(self): + return False + + @property + def cache_result(self): + return False diff --git a/mfr/extensions/office365/settings.py b/mfr/extensions/office365/settings.py new file mode 100644 index 000000000..c92ba78e4 --- /dev/null +++ b/mfr/extensions/office365/settings.py @@ -0,0 +1,6 @@ +from mfr import settings + + +config = settings.child('OFFICE365_EXTENSION_CONFIG') + +OFFICE_BASE_URL = 'https://view.officeapps.live.com/op/embed.aspx?src=' diff --git a/mfr/extensions/office365/templates/viewer.mako b/mfr/extensions/office365/templates/viewer.mako new file mode 100644 index 000000000..cfc2840dc --- /dev/null +++ b/mfr/extensions/office365/templates/viewer.mako @@ -0,0 +1,11 @@ + + + + + + diff --git a/mfr/providers/osf/provider.py b/mfr/providers/osf/provider.py index 8fa52b6a4..1b1e49cc5 100644 --- a/mfr/providers/osf/provider.py +++ b/mfr/providers/osf/provider.py @@ -1,33 +1,44 @@ -import os import json import hashlib +from http import HTTPStatus import logging -from urllib.parse import urlparse import mimetypes +import os +from urllib.parse import urlparse import furl import aiohttp from aiohttp.errors import ContentEncodingError -from waterbutler.core import streams - -from mfr.core import exceptions -from mfr.core import provider +from waterbutler.core.streams import ResponseStreamReader + +from mfr.core.exceptions import ( + DownloadError, + MetadataError, + TooBigToRenderError, + QueryParameterError +) +from mfr.core.provider import ( + BaseProvider, + ProviderMetadata +) from mfr.core.utils import sizeof_fmt -from mfr.providers.osf import settings +from mfr.providers.osf.settings import ( + MFR_ACTION_HEADER, + MFR_IDENTIFYING_HEADER +) from mfr.settings import MAX_FILE_SIZE_TO_RENDER -from mfr.core.exceptions import TooBigToRenderError logger = logging.getLogger(__name__) -class OsfProvider(provider.BaseProvider): +class OsfProvider(BaseProvider): """Open Science Framework (https://osf.io) -aware provider. Knows the OSF ecosystem and can request specific metadata for the file referenced by the URL. Can correctly propagate OSF authorization to verify ownership and permisssions of file. """ - UNNEEDED_URL_PARAMS = ('_', 'token', 'action', 'mode', 'displayName') + UNNEEDED_URL_PARAMS = {'_', 'token', 'action', 'mode', 'displayName'} NAME = 'osf' def __init__(self, request, url, action=None): @@ -73,20 +84,23 @@ async def metadata(self): metadata_response = await self._make_request( 'HEAD', download_url, - headers={settings.MFR_ACTION_HEADER: self.action or ''} + headers={MFR_ACTION_HEADER: self.action or ''} ) response_code = metadata_response.status response_reason = metadata_response.reason response_headers = metadata_response.headers await metadata_response.release() - if response_code != 200: - raise exceptions.MetadataError( - 'Failed to fetch file metadata from WaterButler. Received response: ', - 'code {} {}'.format(str(response_code), str(response_reason)), + if response_code != HTTPStatus.OK: + raise MetadataError( + '''Failed to fetch file metadata from WaterButler. + Received response: code {} {}'''.format( + str(response_code), + str(response_reason) + ), metadata_url=download_url, response=response_reason, provider=self.NAME, - code=400 + code=HTTPStatus.BAD_REQUEST ) try: @@ -97,14 +111,16 @@ async def metadata(self): self.metrics.add('metadata.raw', metadata) # e.g., - # metadata = {'data': { - # 'name': 'blah.png', - # 'contentType': 'image/png', - # 'etag': 'ABCD123456...', - # 'extra': { - # ... - # }, - # }} + # metadata = { + # 'data': { + # 'name': 'blah.png', + # 'contentType': 'image/png', + # 'etag': 'ABCD123456...', + # 'extra': { + # ... + # }, + # } + # } name, ext = os.path.splitext(metadata['data']['name']) size = metadata['data']['size'] @@ -112,9 +128,10 @@ async def metadata(self): max_file_size = MAX_FILE_SIZE_TO_RENDER.get(ext) if max_file_size and size and int(size) > max_file_size: raise TooBigToRenderError( - "This file with extension '{ext}' exceeds the size limit of {max_size} and will not " - "be rendered. To view this file download it and view it " - "offline.".format(ext=ext, max_size=sizeof_fmt(max_file_size)), + '''This file with extension '{ext}' exceeds the size limit of {max_size} and will + not be rendered. To view this file download it and view it offline.'''.format( + ext=ext, max_size=sizeof_fmt(max_file_size) + ), requested_size=int(size), maximum_size=max_file_size, ) @@ -128,19 +145,38 @@ async def metadata(self): stable_str = '/{}/{}{}'.format(meta['resource'], meta['provider'], meta['path']) stable_id = hashlib.sha256(stable_str.encode('utf-8')).hexdigest() logger.debug('stable_identifier: str({}) hash({})'.format(stable_str, stable_id)) - - return provider.ProviderMetadata(name, ext, content_type, unique_key, download_url, stable_id) + is_public = False + public_file = cleaned_url.args.get('public_file', None) + if public_file: + if public_file not in {'True', 'False'}: + raise QueryParameterError( + 'Invalid value for query parameter `public_file`: {}'.format(cleaned_url.args['public_file']), + url=download_url, + provider=self.NAME, + code=HTTPStatus.BAD_REQUEST, + ) + is_public = public_file == 'True' + + return ProviderMetadata( + name, + ext, + content_type, + unique_key, + download_url, + stable_id, + is_public=is_public + ) async def download(self): """Download file from WaterButler, returning stream.""" download_url = await self._fetch_download_url() - headers = {settings.MFR_IDENTIFYING_HEADER: '1'} + headers = {MFR_IDENTIFYING_HEADER: '1'} response = await self._make_request('GET', download_url, allow_redirects=False, headers=headers) - if response.status >= 400: + if response.status >= HTTPStatus.BAD_REQUEST: resp_text = await response.text() logger.error('Unable to download file: ({}) {}'.format(response.status, resp_text)) - raise exceptions.DownloadError( + raise DownloadError( 'Unable to download the requested file, please try again later.', download_url=download_url, response=resp_text, @@ -148,12 +184,12 @@ async def download(self): ) self.metrics.add('download.saw_redirect', False) - if response.status in (302, 301): + if response.status in {HTTPStatus.MOVED_PERMANENTLY, HTTPStatus.FOUND}: await response.release() response = await aiohttp.request('GET', response.headers['location']) self.metrics.add('download.saw_redirect', True) - return streams.ResponseStreamReader(response) + return ResponseStreamReader(response) async def _fetch_download_url(self): """Provider needs a WaterButler URL to download and get metadata. If ``url`` is already @@ -181,8 +217,8 @@ async def _fetch_download_url(self): await request.release() logger.debug('osf-download-resolver: request.status::{}'.format(request.status)) - if request.status != 302: - raise exceptions.MetadataError( + if request.status != HTTPStatus.FOUND: + raise MetadataError( request.reason, metadata_url=self.url, provider=self.NAME, diff --git a/setup.py b/setup.py index ecfe3aebd..08931112a 100755 --- a/setup.py +++ b/setup.py @@ -43,6 +43,9 @@ def parse_requirements(requirements): 'http = mfr.providers.http:HttpProvider', 'osf = mfr.providers.osf:OsfProvider', ], + 'mfr.public_renderers': [ + '.docx = mfr.extensions.office365:Office365Renderer', + ], 'mfr.exporters': [ # google docs '.gdraw = mfr.extensions.image:ImageExporter', diff --git a/tests/extensions/office365/__init__.py b/tests/extensions/office365/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/extensions/office365/test_renderer.py b/tests/extensions/office365/test_renderer.py new file mode 100644 index 000000000..5d90d3990 --- /dev/null +++ b/tests/extensions/office365/test_renderer.py @@ -0,0 +1,54 @@ +from urllib import parse + +import furl +import pytest + +from mfr.core.provider import ProviderMetadata +from mfr.extensions.office365 import Office365Renderer +from mfr.extensions.office365 import settings as office365_settings + + +@pytest.fixture +def metadata(): + return ProviderMetadata( + 'test', + '.pdf', + 'text/plain', + '1234', + 'http://wb.osf.io/file/test.pdf?token=1234&public_file=1', + is_public=True + ) + + +@pytest.fixture +def file_path(): + return '/tmp/test.docx' + + +@pytest.fixture +def url(): + return parse.quote('http://osf.io/file/test.pdf') + + +@pytest.fixture +def assets_url(): + return 'http://mfr.osf.io/assets' + + +@pytest.fixture +def export_url(): + return 'http://mfr.osf.io/export?url=' + url() + + +@pytest.fixture +def renderer(metadata, file_path, url, assets_url, export_url): + return Office365Renderer(metadata, file_path, url, assets_url, export_url) + + +class TestOffice365Renderer: + + def test_render_pdf(self, renderer, metadata): + download_url = furl.furl(metadata.download_url).set(query='') + office_render_url = office365_settings.OFFICE_BASE_URL + parse.quote(download_url.url) + body = renderer.render() + assert ''.format(office_render_url) in body diff --git a/tests/server/handlers/test_query_params.py b/tests/server/handlers/test_query_params.py index b707035dc..001c42b27 100644 --- a/tests/server/handlers/test_query_params.py +++ b/tests/server/handlers/test_query_params.py @@ -7,7 +7,7 @@ from tests import utils -class TestRenderHandler(utils.HandlerTestCase): +class TestQueryParamsHandler(utils.HandlerTestCase): @testing.gen_test def test_format_url(self):