Imports, definitions, and data#

This section imports the required Python libraries, sets the dataset paths, and defines the processing functions used later in the tutorial.

%matplotlib inline
import pandas as pd
import re
from ast import literal_eval

Import the spaCy library and load the language model#

The spaCy library provides pre-built natural language processing tools and models.

import spacy
import en_core_web_lg

# Load the large English model; the en_core_web_lg package must be
# installed (e.g. python -m spacy download en_core_web_lg)
NLP = en_core_web_lg.load()
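
To confirm the model loaded, you can run the pipeline on a short sample sentence. This is a minimal sketch; the sentence is only an illustration and is not drawn from the datasets.

# Parse one sample sentence and print a few token attributes
doc = NLP("Susan B. Anthony delivered a speech in Rochester, New York.")
for token in doc[:5]:
    print(token.text, token.pos_, token.lemma_)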

Set the dataset path constants#

The following code stores the relative path of each provided dataset in a constant for reuse throughout this notebook. These files are in the data folder that was downloaded alongside this notebook.

# Susan B. Anthony
ANTHONY = "data/anthony/susan-b-anthony-papers_2022-10-12.csv"
SPEECHES = "data/anthony/anthony_speech_list.csv"

# Carrie Chapman Catt
CATT = "data/catt/carrie-chapman-catt-papers_2022-10-12.csv"

# Elizabeth Cady Stanton
STANTON = "data/stanton/elizabeth-cady-stanton-papers_2022-10-19.csv"

# Mary Church Terrell
TERRELL = "data/terrell/mary-church-terrell-advocate-for-african-americans-and-women_2023-01-20.csv"
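
Before continuing, it can help to confirm that the data folder is in place. The check below is a small optional sketch; it assumes the notebook is run from the folder that contains data.

from pathlib import Path

# Optional sanity check: report any dataset file that is missing
for path in [ANTHONY, SPEECHES, CATT, STANTON, TERRELL]:
    if not Path(path).exists():
        print(f"Missing: {path}")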

Define processing functions#

The following helper functions were written to process the data. They are loaded here to make the later sections easier to read.

def load_csv(file: str) -> pd.DataFrame:
    """Load a CSV file into a data frame.

    Returns:
        df (data frame): A data frame containing the data loaded from
            the CSV, with every column read as a string.
    """

    df = pd.read_csv(file, dtype=str)
    return df
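
For example, the Susan B. Anthony metadata could be loaded and previewed as follows. This is a sketch; the variable name anthony_df is ours, and the exact columns depend on the downloaded CSV.

# Load one dataset and preview it
anthony_df = load_csv(ANTHONY)
print(anthony_df.shape)
anthony_df.head()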


def string_to_filename(string: str) -> str:
    """Converts an arbitrary string into a valid filename."""

    # Lowercase the string, then replace each run of characters that
    # are not digits or lowercase letters with a single underscore
    filename = re.sub(r"[^0-9a-z]+", "_", string.lower())
    return filename
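
For instance, runs of spaces and punctuation collapse to single underscores:

print(string_to_filename("Susan B. Anthony"))   # susan_b_anthony
print(string_to_filename("Votes for Women!"))   # votes_for_women_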


def read_cache(id: str) -> pd.DataFrame:
    """Read a data frame that was previously cached to file.

    Returns:
        df (data frame): A data frame containing previously cached data
    """
    filename = string_to_filename(id)
    df = pd.read_pickle(f"outputs/{filename}.pkl")
    return df


def write_cache(df: pd.DataFrame, id: str) -> None:
    """Cache a data frame to a pickle file in the outputs folder."""

    filename = string_to_filename(id)
    df.to_pickle(f"outputs/{filename}.pkl")
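
A typical round trip looks like the sketch below. It assumes the anthony_df data frame from the earlier example, and it creates the outputs folder first, since to_pickle will not create missing directories.

from pathlib import Path

# Create the cache directory if needed, then cache and reload
Path("outputs").mkdir(exist_ok=True)
write_cache(anthony_df, "Susan B. Anthony")    # writes outputs/susan_b_anthony.pkl
anthony_df = read_cache("Susan B. Anthony")    # reads it back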


def tokens(text) -> list:
    """Runs the spaCy NLP pipeline on text input.

    Returns:
        process (list): A list containing tuples of NLP attributes
            for each word in the transcription.
    """

    doc = NLP(str(text))
    process = [(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
                token.shape_, token.is_alpha, token.is_stop) for token in doc]

    return process
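
Each tuple in the returned list holds, in order: text, lemma, part of speech, fine-grained tag, dependency label, shape, is_alpha, and is_stop. The filtering function below relies on this order (index 2 for the part of speech, 5 for the shape, 7 for the stop-word flag). A quick illustration:

# Print the attribute tuples for the first few tokens
for tup in tokens("The ballot is ours.")[:3]:
    print(tup)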


def entities(text) -> list:
    """Runs named entity recognition (NER) on text input.

    Returns:
        process (list): A list containing tuples of NER attributes
            for each entity found in the transcription.
    """

    doc = NLP(str(text))
    process = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]

    return process
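
Each tuple holds the entity text, its start and end character offsets, and its label. A quick illustration; the exact entities found depend on the model:

print(entities("Susan B. Anthony spoke in Rochester in 1872."))
# e.g. [('Susan B. Anthony', 0, 16, 'PERSON'), ('Rochester', 26, 35, 'GPE'),
#       ('1872', 39, 43, 'DATE')]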


def separate_text(df: pd.DataFrame) -> None:
    """Adds new columns to the data frame, then loops through the
    tokenized text of each row, moving each token to the relevant
    newly created column."""

    # Add new columns (c) to the data frame
    for c in ['text',
              'stop_words',
              'nonalphanums',
              'numbers',
              'ambigs',
              'processed_text']:
        df[c] = pd.Series(dtype=str)

    # Iterate over a copy of tokenized_text to filter words
    # into five categories
    for row in range(df.shape[0]):
        text_block = df['tokenized_text'].iloc[row].copy()

        text = []
        stop_words = []
        nonalphanums = []
        numbers = []
        ambigs = []

        for word in text_block:
            # Move stop words
            if word[7]:
                stop_words.append(word)
            # Move punctuation, whitespace, conjunctions, symbols,
            # and other non-words
            elif word[2] in ['PUNCT', 'SPACE', 'CCONJ', 'X', 'SYM']:
                nonalphanums.append(word)
            # Move numbers
            elif word[2] == 'NUM':
                numbers.append(word)
            # Move ambiguous transcribed words (their shape contains '?')
            elif '?' in word[5]:
                ambigs.append(word)
            # Keep the remaining words as text
            else:
                text.append(word)

        # Assign with .at, which can set a list into a single cell;
        # chained .iloc assignment would raise a SettingWithCopyWarning
        df.at[df.index[row], 'text'] = text
        df.at[df.index[row], 'stop_words'] = stop_words
        df.at[df.index[row], 'nonalphanums'] = nonalphanums
        df.at[df.index[row], 'numbers'] = numbers
        df.at[df.index[row], 'ambigs'] = ambigs
        # Create a processed_text column containing lowercase lemmas
        # for all words in list 'text'
        df.at[df.index[row], 'processed_text'] = [i[1].lower() for i in text]
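
Putting the helpers together, a typical workflow looks like the sketch below. It assumes the page text lives in a column named transcription (an assumption; check the dataset's actual column names before running it).

# Tokenize each transcription, then sort the tokens into categories
anthony_df['tokenized_text'] = anthony_df['transcription'].apply(tokens)
separate_text(anthony_df)
anthony_df[['text', 'stop_words', 'numbers', 'ambigs', 'processed_text']].head()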