Imports, definitions, and data#
This section imports the required Python libraries, sets the dataset paths, and defines the processing functions used later in the tutorial.
# Render matplotlib plots inline in the notebook
%matplotlib inline

import pandas as pd            # data frames
import re                      # regular expressions for filename cleaning
from ast import literal_eval   # safely parse stringified Python literals
Import the spaCy library and load the language model#
The spaCy library provides pre-built natural language processing tools and models.
import spacy
import en_core_web_lg
# Load the model
NLP = en_core_web_lg.load()
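If loading the model raises an error, it can be installed from the command line with python -m spacy download en_core_web_lg. As a quick sanity check, the loaded pipeline can be run on a short string (the sentence below is an arbitrary illustration, not part of the datasets):

# Run the pipeline on an arbitrary example sentence
doc = NLP("Susan B. Anthony delivered a speech in Rochester in 1873.")
print([(token.text, token.pos_) for token in doc])
print([(ent.text, ent.label_) for ent in doc.ents])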
Set the dataset path constants#
The following code stores the relative path of each provided dataset in a constant for reuse throughout this notebook. These files are in the data folder that was downloaded alongside this notebook.
# Susan B. Anthony
ANTHONY = "data/anthony/susan-b-anthony-papers_2022-10-12.csv"
SPEECHES = "data/anthony/anthony_speech_list.csv"
# Carrie Chapman Catt
CATT = "data/catt/carrie-chapman-catt-papers_2022-10-12.csv"
# Elizabeth Cady Stanton
STANTON = "data/stanton/elizabeth-cady-stanton-papers_2022-10-19.csv"
# Mary Church Terrell
TERRELL = "data/terrell/mary-church-terrell-advocate-for-african-americans-and-women_2023-01-20.csv"
Define processing functions#
The following helper functions process the data. They are defined here up front to keep the later sections easier to read.
def load_csv(file: str) -> pd.DataFrame:
    """Load a CSV file into a data frame.

    Returns:
        df (data frame): A data frame containing the data loaded from the CSV.
    """
    # Read every column as a string to avoid mixed-type inference
    df = pd.read_csv(file, dtype=str)
    return df
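For example, one of the path constants defined above can be passed to load_csv (the anthony_df name is illustrative):

# Load the Susan B. Anthony dataset and preview the first rows
anthony_df = load_csv(ANTHONY)
anthony_df.head()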
def string_to_filename(string: str) -> str:
    """Converts an arbitrary string into a valid filename"""
    # Replace runs of characters outside [0-9a-z] with underscores,
    # then collapse any repeated underscores
    s = re.sub('[^0-9a-z]+', '_', string.lower())
    filename = re.sub(r"_+", '_', s)
    return filename
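As an illustration, an arbitrary label maps to a filename-safe string:

# Returns 'susan_b_anthony_papers'
string_to_filename("Susan B. Anthony Papers")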
def read_cache(id: str) -> pd.DataFrame:
    """Read a data frame that was cached to file.

    Returns:
        df (data frame): A data frame containing previously cached data.
    """
    filename = string_to_filename(id)
    df = pd.read_pickle(f"outputs/{filename}.pkl")
    return df
def write_cache(df: pd.DataFrame, id: str) -> None:
    """Cache a data frame to file"""
    filename = string_to_filename(id)
    df.to_pickle(f"outputs/{filename}.pkl")
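These two functions assume an outputs folder next to the notebook. A minimal round trip might look like this (reusing the illustrative anthony_df frame from above):

import os

# Make sure the cache directory exists before writing
os.makedirs("outputs", exist_ok=True)
write_cache(anthony_df, "Susan B. Anthony")   # writes outputs/susan_b_anthony.pkl
cached = read_cache("Susan B. Anthony")       # reads the same file back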
def tokens(text) -> list:
    """Runs the spaCy NLP pipeline on text input.

    Returns:
        process (list): A list containing a tuple of NLP attributes
            for each token in the transcription.
    """
    # Coerce to str so missing (NaN) transcriptions do not raise
    doc = NLP(str(text))
    process = [(token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
                token.shape_, token.is_alpha, token.is_stop) for token in doc]
    return process
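Each tuple records the token text, lemma, coarse and fine-grained part-of-speech tags, dependency label, shape, and the is_alpha and is_stop flags. For example (exact values may vary with the model version):

tokens("The convention met in 1848.")
# A representative tuple for the word 'met' might look like:
# ('met', 'meet', 'VERB', 'VBD', 'ROOT', 'xxx', True, False)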
def entities(text) -> list:
    """Runs named entity recognition (NER) on text input.

    Returns:
        process (list): A list containing a tuple of NER attributes
            for each entity in the transcription.
    """
    # Coerce to str so missing (NaN) transcriptions do not raise
    doc = NLP(str(text))
    process = [(ent.text, ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    return process
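For example (the entity labels and character spans shown are representative of en_core_web_lg and may vary with the model version):

entities("Susan B. Anthony spoke in Rochester, New York, in November 1872.")
# Representative output:
# [('Susan B. Anthony', 0, 16, 'PERSON'),
#  ('Rochester', 26, 35, 'GPE'),
#  ('New York', 37, 45, 'GPE'),
#  ('November 1872', 50, 63, 'DATE')]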
def separate_text(df: pd.DataFrame) -> None:
    """Adds new columns to the data frame, then loops through the
    tokenized text of each row, moving each token into the relevant
    newly created column."""
    # Add new columns (c) to the data frame
    for c in ['text', 'stop_words', 'nonalphanums', 'numbers',
              'ambigs', 'processed_text']:
        df[c] = pd.Series(dtype=str)
    # Iterate over a copy of tokenized_text to filter words into five
    # categories. Each word is a tuple produced by tokens(): index 2
    # is the coarse part-of-speech tag, index 5 is the token shape,
    # and index 7 is the is_stop flag.
    for row in range(df.shape[0]):
        text_block = df['tokenized_text'].iloc[row].copy()
        text = []
        stop_words = []
        nonalphanums = []
        numbers = []
        ambigs = []
        for idx, word in enumerate(text_block):
            # Move stopwords
            if word[7]:
                stop_words.append(text_block[idx])
            # Move punctuation, whitespace, and other non-words
            elif word[2] in ['PUNCT', 'SPACE', 'CCONJ', 'X', 'SYM']:
                nonalphanums.append(text_block[idx])
            # Move numbers
            elif word[2] == 'NUM':
                numbers.append(text_block[idx])
            # Move ambiguous transcribed words (a '?' in the shape
            # means the token itself contains a '?')
            elif '?' in word[5]:
                ambigs.append(text_block[idx])
            # Keep the remaining words as text
            else:
                text.append(text_block[idx])
        # Set whole lists into single cells with .at; chained
        # df[...].iloc[row] = ... assignment writes through a
        # temporary copy and raises SettingWithCopyWarning
        label = df.index[row]
        df.at[label, 'text'] = text
        df.at[label, 'stop_words'] = stop_words
        df.at[label, 'nonalphanums'] = nonalphanums
        df.at[label, 'numbers'] = numbers
        df.at[label, 'ambigs'] = ambigs
        # Create a processed_text column containing lowercase lemmas
        # for all words in list 'text'
        df.at[label, 'processed_text'] = [i[1].lower() for i in text]
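As a small illustration (the one-row sample frame and its Transcription column are hypothetical), separate_text expects a tokenized_text column produced by tokens():

# Build a tiny example frame, tokenize it, and separate the tokens
sample = pd.DataFrame({"Transcription": ["She cast 2 votes in Rochester."]})
sample["tokenized_text"] = sample["Transcription"].apply(tokens)
separate_text(sample)
sample[["text", "stop_words", "numbers", "processed_text"]]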