Downloading Newspaper Titles and Batches from Chronicling America#

Downloading newspaper titles and batches can be achieved with similar methods. The only difference is in how the API Search Query URL is constructed.

In this notebook, we will query and download all newspaper issues found in Chronicling America for the Daily Critic (LCCN: sn82014402) from 1880-1881:

https://www.loc.gov/collections/chronicling-america/?fa=number_lccn:sn82014402&start_date=1880-01-01&end_date=1881-12-31&fo=json

Importing Modules [Required]#

The following imports are required for the scripts to run properly:


  1. Run the following code below.

    • It will import all the modules you need for this notebook.

    • Do not change anything.

import requests
import os
import pandas as pd
import time
import os
import re

Run Functions and Limits [Required]#

Functions and limits define what will be included and excluded in the search for downloads.


  1. Run the code below.

    • Do not change anything.

  2. When the script is complete, it will tell you how many Newspaper Pages it found from your search.

  3. If you are satisfied with the amount of results, proceed to the next section to run the download.

  4. If you are not satisfied with the amount of results, go back and redo the API Search Query.

'''Run P1 search and get a list of results.'''
def get_item_ids(url, items=None, conditional='True'):
    """Collect item-record links from every page of a loc.gov search.

    Args:
        url: A loc.gov search or collection URL. URLs containing
            "loc.gov/item" or "loc.gov/resource" are rejected.
        items: Optional list that matching item links are appended to.
            A fresh list is created when omitted, so results never leak
            from one call into the next.
        conditional: Python expression, given as a string, evaluated with
            ``eval`` once per result. The expression may refer to the
            current ``result`` dict. Results for which it equals False
            are skipped.

    Returns:
        The list of item links. Returns None if the very first request
        fails; if a later page fails, the links gathered so far are
        returned. A message is printed in both failure cases.

    Raises:
        NameError: If ``url`` points at an item or resource page.
    """
    # Check that the query URL is not an item or resource link.
    exclude = ["loc.gov/item", "loc.gov/resource"]
    if any(string in url for string in exclude):
        raise NameError('Your URL points directly to an item or '
                        'resource page (you can tell because "item" '
                        'or "resource" is in the URL). Please use '
                        'a search URL instead. For example, instead '
                        'of \"https://www.loc.gov/item/2009581123/\", '
                        'try \"https://www.loc.gov/maps/?q=2009581123\". ')

    if items is None:
        items = []

    # Request pages of 100 results at a time.
    params = {"fo": "json", "c": 100, "at": "results,pagination"}

    # Walk the pages in a loop instead of recursing, so very large result
    # sets cannot exhaust the interpreter's recursion limit.
    next_url = url
    first_page = True
    while next_url:
        call = requests.get(next_url, params=params)
        # The header may be absent; treat that the same as "not JSON".
        content_type = call.headers.get('content-type') or ''
        if call.status_code != 200 or 'json' not in content_type:
            print('There was a problem. Try running the cell again, or check your searchURL.')
            return None if first_page else items

        data = call.json()
        for result in data['results']:
            # A result without "original_format" is treated as having none.
            original_format = result.get("original_format") or []
            # Filter out anything that's a collection or web page.
            # SECURITY NOTE: `conditional` is executed with eval(); only
            # pass expressions you wrote yourself, never untrusted text.
            filter_out = ("collection" in original_format) \
                or ("web page" in original_format) \
                or (eval(conditional) == False)
            if filter_out:
                continue
            # Get the link to the item record.
            item = result.get("id")
            # Filter out links to Catalog or other platforms.
            if item and item.startswith("http://www.loc.gov/item"):
                items.append(item)

        # Move on to the next page, unless we're on the last page.
        next_url = (data.get("pagination") or {}).get("next")
        first_page = False

    return items


'''Get a list of image URLs from those results
If an item has 2+ copies/pages, all copies/pages
are included. User selects file format (e.g., tiff).'''
def get_image_urls(id_list, mimetype, items=None):
    """Look up each item record and collect one file URL per page.

    Args:
        id_list: Iterable of item URLs, each requested with ``fo=json``.
        mimetype: Desired format. Short names are expanded: 'tif' and
            'jpg' become 'tiff' and 'jpeg', 'pdf' becomes
            'application/pdf', anything else becomes 'image/<name>'.
            'xml' is matched as 'text/xml' or 'application/xml'
            (NOTE(review): assumed to be how OCR XML is labelled -
            confirm against a live record). A value that already
            contains '/' is used as a full mimetype as-is.
        items: Optional list that results are appended to. A fresh list
            is created when omitted, so repeated calls do not share state.

    Returns:
        The list of dicts with keys 'image_url' and 'item_id'.
    """
    print('Generating a list of files to download . . . ')
    if items is None:
        items = []

    # Standardize any spelling varieties supplied by user.
    if mimetype == 'tif':
        mimetype = 'tiff'
    if mimetype == 'jpg':
        mimetype = 'jpeg'

    # The accepted mimetypes do not change per item, so work them out once.
    if '/' in mimetype:
        accepted_mimetypes = {mimetype}
    elif mimetype == 'pdf':
        accepted_mimetypes = {'application/pdf'}
    elif mimetype == 'xml':
        accepted_mimetypes = {'text/xml', 'application/xml'}
    else:
        accepted_mimetypes = {'image/' + mimetype}

    params = {"fo": "json"}
    for item in id_list:
        call = requests.get(item, params=params)
        if call.status_code not in (200, 429):
            # Unexpected status: wait, then retry this item exactly once.
            time.sleep(15)
            try:
                call = requests.get(item, params=params)
            except requests.exceptions.RequestException:
                print('Skipping: '+ item)
                continue
        if call.status_code == 429:
            print('Too many requests to API. Stopping early.')
            break

        # Only a successful response is parsed; an error body would not
        # contain the expected 'resources' structure.
        data = None
        if call.status_code == 200:
            try:
                data = call.json()
            except ValueError:
                data = None
        if not isinstance(data, dict):
            print('Skipping: '+ item)
            continue

        resources = data.get('resources') or []
        for resource_index, resource in enumerate(resources):
            # Only used in the "no files found" note, so a missing key
            # must not abort the whole run.
            try:
                resource_url = data['item']['resources'][resource_index]['url']
            except (KeyError, IndexError, TypeError):
                resource_url = item

            # Assumes each entry of resource['files'] is a list of dicts
            # describing the formats available for one page.
            for index, page_files in enumerate(resource.get('files') or []):
                matching_urls = [
                    entry['url'] for entry in page_files
                    if isinstance(entry, dict)
                    and entry.get('mimetype') in accepted_mimetypes
                    and entry.get('url')
                ]
                if matching_urls:
                    # When several files share the format, keep the last.
                    file_info = {}
                    file_info['image_url'] = matching_urls[-1]
                    file_info['item_id'] = item
                    items.append(file_info)
                else:
                    print('Note: No ' + mimetype +
                          ' files found in '+
                          resource_url + '?sp=' + str(index+1))
        # Pause between requests
        time.sleep(2)
    print('\nFound '+str(len(id_list))+' items')
    print('Found '+str(len(items))+' files to download')
    return items

Define Your API Search Query and Generate the Download List#

After running the Importing Modules code (above),

  1. Paste your Search Query URL below, into the searchURL = '{URL}'

  2. Edit the file type you wish to download in fileExtension = '{filetype}'. PDF works best, but the options include pdf, jpeg, and xml (OCR files).

  3. Add the location where you want your files saved to in “saveTo”

  4. When ready, Run the code. This will provide a list of items to be downloaded.

# Perform Query - Paste your API Search Query URL into the searchURL
searchURL = 'https://www.loc.gov/collections/chronicling-america/?fa=number_lccn:sn82014402&start_date=1880-01-01&end_date=1881-12-31&fo=json'

# Add your desired file type (extension). Options Include: pdf, jpeg, and xml (OCR files)
fileExtension = 'pdf'

# Collect the item links for the search. get_item_ids prints a message and
# returns None when the search request fails; fall back to an empty list so
# the next step reports zero items instead of raising a TypeError.
ids = get_item_ids(searchURL, items=[]) or []

# Resolve every item to the individual file URLs of the chosen type.
image_urls_list = get_image_urls(ids, fileExtension, items=[])

print('\nList of files to be downloaded:')
for url in image_urls_list:
    print(url['image_url'])
Generating a list of files to download . . . 

Found 16 items
Found 64 files to download

List of files to be downloaded:
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102986/1880052201/0006.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102986/1880052201/0007.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102986/1880052201/0008.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102986/1880052201/0009.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102986/1880052201/0010.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102986/1880052201/0011.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102986/1880052201/0012.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102986/1880052201/0013.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102986/1880062801/0015.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102986/1880062801/0016.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102986/1880062801/0018.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102986/1880062801/0019.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102986/1880070301/0021.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102986/1880070301/0023.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102986/1880070301/0024.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102986/1880070301/0025.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102986/1880082801/0027.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102986/1880082801/0029.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102986/1880082801/0030.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102986/1880082801/0031.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102986/1880083101/0033.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102986/1880083101/0034.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102986/1880083101/0036.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102986/1880083101/0037.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060101/0005.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060101/0006.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060101/0007.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060101/0008.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060201/0009.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060201/0010.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060201/0011.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060201/0012.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060301/0013.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060301/0014.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060301/0015.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060301/0016.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060401/0017.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060401/0018.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060401/0019.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060401/0020.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060601/0021.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060601/0022.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060601/0023.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060601/0024.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060701/0025.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060701/0026.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060701/0027.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060701/0028.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060801/0029.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060801/0030.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060801/0031.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060801/0032.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060901/0033.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060901/0034.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060901/0035.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881060901/0036.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881061001/0037.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881061001/0038.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881061001/0039.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881061001/0040.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881061101/0041.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881061101/0042.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881061101/0043.pdf
https://tile.loc.gov/storage-services/service/ndnp/dlc/batch_dlc_chester_ver02/data/sn82014402/00211102822/1881061101/0044.pdf
# Add your Local saveTo Location (e.g. C:/Downloads/)
saveTo = 'output'

# URLs that could not be saved, so the closing message reflects what
# actually happened.
failed_downloads = []

for url in image_urls_list:
    image_url = url['image_url']  # Extract the image_url value from the dictionary

    # The folder names are taken from the end of the URL path, expected as
    # .../<batch>/data/<lccn>/<reel>/<issue>/<filename>
    url_parts = image_url.split('/')
    if len(url_parts) < 6:
        print('Skipping (unexpected URL layout): ' + image_url)
        failed_downloads.append(image_url)
        continue
    batch_name = url_parts[-6]
    lccn_name = url_parts[-4]
    reel_name = url_parts[-3]
    issue_name = url_parts[-2]
    filename = url_parts[-1]

    # Download the file; only a successful response is written to disk so
    # that error pages are never saved under the file's name.
    response = requests.get(image_url)
    if response.status_code != 200:
        print('Skipping (HTTP ' + str(response.status_code) + '): ' + image_url)
        failed_downloads.append(image_url)
        continue

    # makedirs creates every missing level (batch/lccn/reel/issue) at once
    # and exist_ok avoids an error when the folder is already there.
    issue_path = os.path.join(saveTo, batch_name, lccn_name, reel_name, issue_name)
    os.makedirs(issue_path, exist_ok=True)

    file_path = os.path.join(issue_path, filename)
    with open(file_path, 'wb') as f:
        f.write(response.content)

# Report the outcome once every URL has been attempted.
if failed_downloads:
    print('\nFinished, but ' + str(len(failed_downloads)) +
          ' file(s) could not be downloaded:')
    for failed_url in failed_downloads:
        print(failed_url)
else:
    print('\nSuccess! Please check your saveTo location to see the saved files.\nYou can also redownload the selected files using the links above.')
Success! Please check your saveTo location to see the saved files.
You can also redownload the selected files using the links above.