Skip to content

Commit

Permalink
Merge pull request #734 from c2corg/sitemaps-xml
Browse files Browse the repository at this point in the history
Sitemaps xml #733
  • Loading branch information
cbeauchesne authored Feb 15, 2019
2 parents 6b8fcb3 + e6567df commit 4fd5dd8
Show file tree
Hide file tree
Showing 6 changed files with 338 additions and 3 deletions.
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,7 @@
*.pyc
*.egg-info
/.idea
/.vscode
.noseids
Dockerfile
/env_api
4 changes: 3 additions & 1 deletion c2corg_api/caching.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ def create_region(name):
cache_document_version = create_region('version')
cache_document_info = create_region('info')
cache_sitemap = create_region('sitemap')
cache_sitemap_xml = create_region('sitemap_xml')

caches = [
cache_document_cooked,
Expand All @@ -36,7 +37,8 @@ def create_region(name):
cache_document_history,
cache_document_version,
cache_document_info,
cache_sitemap
cache_sitemap,
cache_sitemap_xml
]


Expand Down
105 changes: 105 additions & 0 deletions c2corg_api/tests/views/test_sitemap_xml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
from c2corg_api.models.route import RouteLocale, Route
from c2corg_api.models.waypoint import WaypointLocale, Waypoint

from c2corg_api.tests.views import BaseTestRest


class TestSitemapXml(BaseTestRest):
    """Functional tests for the `/sitemaps.xml` index and sitemap pages."""

    def setUp(self):  # noqa
        super().setUp()
        self._prefix = '/sitemaps.xml'
        self.ui_url = 'https://www.camptocamp.org'
        self.schema_url = '{http://www.sitemaps.org/schemas/sitemap/0.9}'

        # Fixtures: two waypoints (three locales in total) and one route
        # with a single locale.
        crolles_locales = [
            WaypointLocale(lang='fr', title='Dent de Crolles'),
        ]
        self.waypoint1 = Waypoint(
            waypoint_type='summit', elevation=2000, locales=crolles_locales)
        self.session.add(self.waypoint1)

        mont_blanc_locales = [
            WaypointLocale(lang='en', title='Mont Blanc'),
            WaypointLocale(lang='fr', title='Mont Blanc'),
        ]
        self.waypoint2 = Waypoint(
            waypoint_type='summit', elevation=4985,
            locales=mont_blanc_locales)
        self.session.add(self.waypoint2)

        route_locales = [
            RouteLocale(
                lang='fr', title='Mont Blanc du ciel',
                title_prefix='Mont Blanc'),
        ]
        self.route = Route(
            activities=['skitouring'], elevation_max=1500, elevation_min=700,
            locales=route_locales)
        self.session.add(self.route)
        self.session.flush()

    def _assert_first_url(self, urlset, expected_loc):
        """Check the child tags and the `loc` text of the first `<url>`."""
        first = urlset[0]
        self.assertEqual(first[0].tag, self.schema_url + "loc")
        self.assertEqual(first[1].tag, self.schema_url + "lastmod")
        self.assertEqual(first[0].text, expected_loc)

    def test_get(self):
        """The index lists the first waypoint and route sitemap pages."""
        index = self.app.get(self._prefix, status=200).xml
        api_root = 'https://api.camptocamp.org/sitemaps.xml'

        for expected_loc in (api_root + '/w/0.xml', api_root + '/r/0.xml'):
            self.assertTrue(
                any(entry[0].text == expected_loc for entry in index)
            )

    def test_get_sitemap_invalid_doc_type(self):
        """An unknown document type code is rejected with a 400."""
        body = self.app.get(self._prefix + '/z/0.xml', status=400).json
        self.assertError(body['errors'], 'doc_type', 'invalid doc_type')

    def test_get_sitemap_invalid_page(self):
        """A negative page number is rejected with a 400."""
        body = self.app.get(self._prefix + '/a/-123.xml', status=400).json
        self.assertError(body['errors'], 'i', 'invalid i')

    def test_get_waypoint_sitemap(self):
        """The waypoint sitemap has one `<url>` per waypoint locale."""
        urlset = self.app.get(self._prefix + '/w/0.xml', status=200).xml

        self.assertEqual(len(urlset), 3)
        self._assert_first_url(
            urlset,
            "{}/waypoints/{}/fr/dent-de-crolles".format(
                self.ui_url, self.waypoint1.document_id))

    def test_get_waypoint_sitemap_no_pages(self):
        """Requesting a page past the last one yields a 404."""
        self.app.get(self._prefix + '/w/1.xml', status=404)

    def test_get_route_sitemap(self):
        """Route URLs use the slug of `title_prefix` followed by `title`."""
        urlset = self.app.get(self._prefix + '/r/0.xml', status=200).xml

        self.assertEqual(len(urlset), 1)
        self._assert_first_url(
            urlset,
            "{}/routes/{}/fr/mont-blanc-mont-blanc-du-ciel".format(
                self.ui_url, self.route.document_id))
225 changes: 225 additions & 0 deletions c2corg_api/views/sitemap_xml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
import functools
import logging

from c2corg_api import DBSession, caching
from c2corg_api.caching import cache_sitemap_xml
from c2corg_api.models.cache_version import CacheVersion
from c2corg_api.models.document import Document, DocumentLocale
from c2corg_api.models.route import ROUTE_TYPE, RouteLocale
from c2corg_api.models.user_profile import USERPROFILE_TYPE
from c2corg_api.views import cors_policy, etag_cache
from c2corg_api.views.validation import create_int_validator, \
validate_document_type
from c2corg_common.utils.caching import get_or_create
from cornice.resource import resource, view
from pyramid.httpexceptions import HTTPNotFound
from sqlalchemy.sql.functions import func
from math import ceil
from datetime import date, datetime, timezone
from slugify import slugify

log = logging.getLogger(__name__)

# Search engines accept at most 50000 URLs per sitemap, and a sitemap file
# may not exceed 10 MB. With 50000 URLs the sitemaps stay below 9 MB, but to
# be safe we use 45000 URLs per sitemap.
# See http://www.sitemaps.org/protocol.html
PAGES_PER_SITEMAP = 45000


# Maps a document type code (the `{doc_type}` part of the sitemap path) to
# the first path segment of the matching page URL on www.camptocamp.org.
UI_ENTRY_POINTS = {
    'a': 'areas',
    'b': 'books',
    'c': 'articles',
    'i': 'images',
    'm': 'maps',
    'o': 'outings',
    'r': 'routes',
    'w': 'waypoints',
    'x': 'xreports'
}

# Validator for the `i` (sitemap page number) request parameter.
# NOTE(review): `create_int_validator` is defined elsewhere; it presumably
# stores the parsed integer in `request.validated['i']` - confirm.
validate_page = create_int_validator('i')


@resource(
    collection_path='/sitemaps.xml', path='/sitemaps.xml/{doc_type}/{i}.xml',
    cors_policy=cors_policy, renderer='string')
class SitemapXml(object):
    """Resource serving the sitemap index and the individual sitemap pages
    as XML strings, cached in the `cache_sitemap_xml` region.
    """

    def __init__(self, request):
        self.request = request

    @view()
    def collection_get(self):
        """ Returns the sitemap index file.
        See: http://www.sitemaps.org/protocol.html

        The response lists the URL of every sitemap page, for example:

        <?xml version="1.0" encoding="UTF-8"?>
        <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
            <sitemap>
                <loc>https://api.camptocamp.org/sitemaps.xml/w/0.xml</loc>
                <lastmod>2019-02-11T18:01:49.193770+00:00</lastmod>
            </sitemap>
            <sitemap>
                <loc>https://api.camptocamp.org/sitemaps.xml/i/0.xml</loc>
                <lastmod>2019-02-11T18:01:49.193770+00:00</lastmod>
            </sitemap>
            <sitemap>
                <loc>https://api.camptocamp.org/sitemaps.xml/i/1.xml</loc>
                <lastmod>2019-02-11T18:01:49.193770+00:00</lastmod>
            </sitemap>
        </sitemapindex>
        """
        key = _get_cache_key()
        # NOTE(review): `etag_cache` is defined elsewhere; presumably it
        # handles ETag headers and may end the request early - confirm.
        etag_cache(self.request, key)

        response = self.request.response
        response.content_type = "text/xml"

        return get_or_create(cache_sitemap_xml, key, _get_sitemap_index)

    @view(validators=[validate_page, validate_document_type])
    def get(self):
        """ Returns the sitemap file for the validated document type and
        sitemap page number taken from the request.
        """
        validated = self.request.validated
        doc_type = validated['doc_type']
        page = validated['i']

        self.request.response.content_type = "text/xml"

        key = _get_cache_key(doc_type, page)
        etag_cache(self.request, key)

        # The cache helper receives a creator that already carries the
        # arguments of `_get_sitemap`.
        build_sitemap = functools.partial(_get_sitemap, doc_type, page)
        return get_or_create(cache_sitemap_xml, key, build_sitemap)


def _get_cache_key(doc_type=None, i=None):
    """Return the cache key for the sitemap index or for one sitemap page.

    Every key ends with today's date (ISO format) and the current
    `caching.CACHE_VERSION`. When `doc_type` is truthy the key is prefixed
    with the document type and the page number `i`.
    """
    suffix = '{}-{}'.format(date.today().isoformat(), caching.CACHE_VERSION)
    if not doc_type:
        return suffix
    return '{}-{}-{}'.format(doc_type, i, suffix)


def _get_sitemap_index():
    """Build the sitemap index XML listing every sitemap page.

    Counts the document locales per document type (user profiles excluded)
    and emits one `<sitemap>` entry for each block of `PAGES_PER_SITEMAP`
    locales. All entries share the current UTC time as `lastmod`.

    Returns:
        The `<sitemapindex>` document as a string.
    """
    # Redirected documents are left out of the count because `_get_sitemap`
    # does not list them either. Counting them could advertise a last page
    # that turns out to be empty and therefore answers with a 404.
    document_locales_per_type = DBSession. \
        query(Document.type, func.count().label('count')). \
        join(
            DocumentLocale,
            Document.document_id == DocumentLocale.document_id). \
        filter(Document.type != USERPROFILE_TYPE). \
        filter(Document.redirects_to.is_(None)). \
        group_by(Document.type). \
        all()

    # Timezone-aware "now" in UTC; `datetime.utcnow()` is deprecated.
    lastmod = datetime.now(timezone.utc).isoformat()

    template = (
        "<sitemap>\n"
        "<loc>https://api.camptocamp.org/sitemaps.xml/"
        "{doc_type}/{i}.xml</loc>\n"
        "<lastmod>{lastmod}</lastmod>\n"
        "</sitemap>"
    )

    sitemaps = []
    for doc_type, count in document_locales_per_type:
        # Number of pages needed to hold `count` URLs.
        num_sitemaps = ceil(count / PAGES_PER_SITEMAP)
        sitemaps.extend(
            template.format(doc_type=doc_type, i=i, lastmod=lastmod)
            for i in range(num_sitemaps)
        )

    return (
        '<?xml version="1.0" encoding="UTF-8"?>\n'
        '<sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n'
        '{}\n'
        '</sitemapindex>'
    ).format("\n".join(sitemaps))


def _get_sitemap(doc_type, i):
    """Build the sitemap XML for page `i` of the given document type.

    Each non-redirected document locale of the type contributes one `<url>`
    entry. Rows are ordered by document id and language, and page `i`
    covers rows `PAGES_PER_SITEMAP * i` onwards, at most `PAGES_PER_SITEMAP`
    of them.

    Args:
        doc_type: document type code, a key of `UI_ENTRY_POINTS`.
        i: zero-based sitemap page number.

    Returns:
        The `<urlset>` document as a string.

    Raises:
        HTTPNotFound: if the type has no UI entry point or the page is
            empty.
    """
    # Resolve the UI path segment before querying: a type that is missing
    # from `UI_ENTRY_POINTS` has no public page to link to, so answer 404
    # instead of failing with a `KeyError` once the rows are loaded.
    ui_entry_point = UI_ENTRY_POINTS.get(doc_type)
    if ui_entry_point is None:
        raise HTTPNotFound()

    # The field order must match the positional parameters of
    # `_format_page` that follow `ui_entry_point`.
    fields = [
        Document.document_id, DocumentLocale.lang, DocumentLocale.title,
        CacheVersion.last_updated
    ]

    # include `title_prefix` for routes
    is_route = doc_type == ROUTE_TYPE
    if is_route:
        fields.append(RouteLocale.title_prefix)

    base_query = DBSession. \
        query(*fields). \
        select_from(Document). \
        join(DocumentLocale,
             Document.document_id == DocumentLocale.document_id)

    if is_route:
        # join on `RouteLocale.__table__` instead of `RouteLocale` so that
        # SQLAlchemy does not create an additional join on DocumentLocale
        base_query = base_query. \
            join(RouteLocale.__table__,
                 DocumentLocale.id == RouteLocale.id)

    base_query = base_query. \
        join(CacheVersion,
             Document.document_id == CacheVersion.document_id). \
        filter(Document.redirects_to.is_(None)). \
        filter(Document.type == doc_type). \
        order_by(Document.document_id, DocumentLocale.lang). \
        limit(PAGES_PER_SITEMAP). \
        offset(PAGES_PER_SITEMAP * i)

    document_locales = base_query.all()

    if not document_locales:
        raise HTTPNotFound()

    pages = [
        _format_page(ui_entry_point, *locale)
        for locale in document_locales
    ]

    return (
        '<?xml version="1.0" encoding="UTF-8"?>\n'
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">\n'
        '{}\n'
        '</urlset>'
    ).format("\n".join(pages))


def _format_page(
        ui_entry_point, doc_id, lang, title, last_updated, title_prefix=None):
    """Render the `<url>` sitemap entry of one document locale.

    The page URL ends with a slug of the title; when `title_prefix` is
    truthy the slug is built from the prefix and the title joined by a
    space. `last_updated.isoformat()` becomes the `lastmod` value.
    """
    lastmod = last_updated.isoformat()

    full_title = (
        "{} {}".format(title_prefix, title) if title_prefix else title
    )
    slug = slugify(full_title)

    template = (
        "<url>\n"
        "<loc>https://www.camptocamp.org/"
        "{ui_entry_point}/{document_id}/{lang}/{title}</loc>\n"
        "<lastmod>{lastmod}</lastmod>\n"
        "<changefreq>weekly</changefreq>\n"
        "</url>"
    )
    return template.format(
        ui_entry_point=ui_entry_point,
        document_id=doc_id,
        lang=lang,
        title=slug,
        lastmod=lastmod,
    )
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
api:
image: 'docker.io/c2corg/v6_api:markdown-cooker'
image: 'docker.io/c2corg/v6_api:latest'
ports:
- 6543:6543
environment:
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
alembic==0.8.8
apscheduler==3.2.0
bcrypt==3.1.1
colander==1.4
colander==1.7
elasticsearch==2.3.0
elasticsearch_dsl==2.0.0
geoalchemy2==0.4.0
Expand All @@ -18,6 +18,7 @@ pyramid_debugtoolbar==3.0.5
pyramid_mailer==0.14.1
pyramid_tm==1.0.1
python-json-logger==0.1.5
python-slugify==1.2.4
redis==2.10.5
requests==2.20.1
setuptools==28.8.0
Expand Down

0 comments on commit 4fd5dd8

Please sign in to comment.