Skip to content

Commit

Permalink
Issue 19 Use preferred CMR-Search-After method for iterating through …
Browse files Browse the repository at this point in the history
…results (#22)

* Added vcr tests

* Added uniqueness test

* More comments

* Updated dependencies

* Refactored fixture location

* Remove DSStore files

* Remove ds store files

* Removed CMR-Search-After header once get method completes

* Removed unnecessary reference to requests_mock

---------

Co-authored-by: Doug Newman <douglas.j.newamn@nasa.gov>
  • Loading branch information
doug-newman-nasa and Doug Newman authored Aug 21, 2023
1 parent 32fb1fb commit 1702100
Show file tree
Hide file tree
Showing 11 changed files with 53,164 additions and 19 deletions.
Binary file added .DS_Store
Binary file not shown.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@ venv/
tags
.venv
*.egg-info
dist
dist
.vscode/*
26 changes: 17 additions & 9 deletions cmr/queries.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,10 +51,15 @@ def get(self, limit=2000):
url = self._build_url()

results = []
page = 1
while len(results) < limit:
more_results = True
while more_results == True:

response = get(url, headers=self.headers, params={'page_size': page_size, 'page_num': page})
# Only get what we need
page_size = min(limit - len(results), page_size)
response = get(url, headers=self.headers, params={'page_size': page_size})
if self.headers == None:
self.headers = {}
self.headers['cmr-search-after'] = response.headers['cmr-search-after']

try:
response.raise_for_status()
Expand All @@ -65,13 +70,16 @@ def get(self, limit=2000):
latest = response.json()['feed']['entry']
else:
latest = [response.text]

if len(latest) == 0:
break


results.extend(latest)
page += 1


if page_size > len(response.json()['feed']['entry']) or len(results) >= limit:
more_results = False

# This header is transient. We need to get rid of it before we do another different query
if self.headers['cmr-search-after']:
del self.headers['cmr-search-after']

return results

def hits(self):
Expand Down
367 changes: 358 additions & 9 deletions poetry.lock

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ requests = "^2.26.0"
[tool.poetry.dev-dependencies]
flake8 = "^4.0.1"
pytest = "^6.2.5"
vcrpy = "^5.1.0"

[build-system]
requires = ["poetry-core>=1.0.0"]
Expand Down
Binary file added tests/fixtures/.DS_Store
Binary file not shown.
20,399 changes: 20,399 additions & 0 deletions tests/fixtures/vcr_cassettes/CYGNSS.yaml

Large diffs are not rendered by default.

18,136 changes: 18,136 additions & 0 deletions tests/fixtures/vcr_cassettes/MOD02QKM.yaml

Large diffs are not rendered by default.

12,068 changes: 12,068 additions & 0 deletions tests/fixtures/vcr_cassettes/MOD02QKM_2000.yaml

Large diffs are not rendered by default.

2,088 changes: 2,088 additions & 0 deletions tests/fixtures/vcr_cassettes/TELLUS_GRAC.yaml

Large diffs are not rendered by default.

95 changes: 95 additions & 0 deletions tests/test_multiple_queries.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import unittest
import json

import vcr
import urllib.request

from cmr.queries import GranuleQuery

# Shared VCR configuration: every test in this module opens its cassette
# through `my_vcr.use_cassette(...)`, so these settings apply to all of them.
my_vcr = vcr.VCR(
    record_mode='once',
    decode_compressed_response=True,
    # Header matching is not enabled by default. It is added here so that
    # requests which differ only in their headers (e.g. the value of
    # `cmr-search-after`) are treated as distinct requests, which is what
    # lets these tests verify the search-after paging behaviour.
    match_on=['method', 'scheme', 'host', 'port', 'path', 'query', 'headers']
)

def assert_unique_granules_from_results(granules):
    """
    Assert that no granule appears more than once in a set of search results.

    When we invoke a search request multiple times we want to ensure that we
    don't get the same results back. This is a one shot test as the results
    are preserved by VCR but still useful.

    Granules are compared by their ``'title'`` entry.

    :param granules: iterable of granule mappings, each with a ``'title'`` key
    :returns: ``True`` when every title is distinct (an empty input counts
        as unique)
    :raises AssertionError: if any title occurs more than once; the message
        lists the offending titles
    """
    seen = set()
    duplicates = set()
    for granule in granules:
        title = granule['title']
        if title in seen:
            duplicates.add(title)
        else:
            seen.add(title)

    # Raise explicitly rather than returning False: callers invoke this
    # helper as a bare statement, so a returned False would go unnoticed.
    if duplicates:
        raise AssertionError(
            "Duplicate granules found in results: {}".format(sorted(duplicates))
        )

    return True

class TestMultipleQueries(unittest.TestCase):
    """
    Exercise ``GranuleQuery.get`` / ``get_all`` against recorded CMR responses.

    Each test replays a VCR cassette and checks three things: the number of
    granules returned, the number of HTTP interactions in the cassette, and
    that no ``cmr-search-after`` header is left on the query object afterwards.
    """

    def test_get_more_than_2000(self):
        """
        If we execute a get with a limit of more than 2000
        then we expect multiple invocations of a cmr granule search and
        to not fetch back more results than we ask for
        """
        with my_vcr.use_cassette('tests/fixtures/vcr_cassettes/MOD02QKM.yaml') as cass:
            api = GranuleQuery()

            granules = api.short_name("MOD02QKM").get(3000)
            self.assertEqual(len(granules), 3000)
            # Assert all 3000 granule results have unique granule ids.
            # The helper's result is checked so duplicates fail the test.
            self.assertTrue(assert_unique_granules_from_results(granules))
            # Assert that we performed two search results queries
            self.assertEqual(len(cass), 2)
            # The transient paging header must not leak into later queries
            self.assertIsNone(api.headers.get('cmr-search-after'))

    def test_get(self):
        """
        If we execute a get with no arguments then we expect
        to get the maximum no. of granules from a single CMR call (2000)
        in a single request
        """
        with my_vcr.use_cassette('tests/fixtures/vcr_cassettes/MOD02QKM_2000.yaml') as cass:
            api = GranuleQuery()
            granules = api.short_name("MOD02QKM").get()
            self.assertEqual(len(granules), 2000)
            # Assert all 2000 granule results have unique granule ids.
            # The helper's result is checked so duplicates fail the test.
            self.assertTrue(assert_unique_granules_from_results(granules))
            # Assert that we performed one search results query
            self.assertEqual(len(cass), 1)
            # The transient paging header must not leak into later queries
            self.assertIsNone(api.headers.get('cmr-search-after'))

    def test_get_all_less_than_2k(self):
        """
        If we execute a get_all on a collection with fewer than 2000
        granules then we expect one hits query followed by a single
        cmr granule search, and to fetch back every granule exactly once
        """
        with my_vcr.use_cassette('tests/fixtures/vcr_cassettes/TELLUS_GRAC.yaml') as cass:
            api = GranuleQuery()
            granules = api.short_name("TELLUS_GRAC_L3_JPL_RL06_LND_v04").get_all()
            self.assertEqual(len(granules), 163)
            # Assert all 163 granule results have unique granule ids.
            # The helper's result is checked so duplicates fail the test.
            self.assertTrue(assert_unique_granules_from_results(granules))
            # Assert that we performed a hits query and one search results query
            self.assertEqual(len(cass), 2)
            # The transient paging header must not leak into later queries
            self.assertIsNone(api.headers.get('cmr-search-after'))

    def test_get_all_more_than_2k(self):
        """
        If we execute a get_all on a collection with more than 2000
        granules then we expect one hits query followed by multiple
        invocations of a cmr granule search, and to not fetch back
        more results than exist
        """
        with my_vcr.use_cassette('tests/fixtures/vcr_cassettes/CYGNSS.yaml') as cass:
            api = GranuleQuery()
            granules = api.short_name("CYGNSS_NOAA_L2_SWSP_25KM_V1.2").get_all()
            self.assertEqual(len(granules), 2285)
            # Assert all 2285 granule results have unique granule ids.
            # The helper's result is checked so duplicates fail the test.
            self.assertTrue(assert_unique_granules_from_results(granules))
            # Assert that we performed a hits query and two search results queries
            self.assertEqual(len(cass), 3)
            # The transient paging header must not leak into later queries
            self.assertIsNone(api.headers.get('cmr-search-after'))

0 comments on commit 1702100

Please sign in to comment.