Exploratory Data Analysis

Setup

import pandas as pd
from supabase import create_client
from pathlib import Path
import os
import sys
from dotenv import load_dotenv
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import seaborn as sns
colours = sns.color_palette("Set2")
import random
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
pd.options.mode.chained_assignment = None

project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from eda_utils import add_gbp_columns, get_longest_values, print_in_rows, check_names, check_overlap, clean_start_of_text, update_join_table
from stats_builder import make_summary_df, calculate_stats, make_calculated_df, format_stats, format_df
from plots_builder import make_bar_chart
from utils import get_table_from_supabase, build_relationship_cols, build_financial_history, add_grant_statistics
from data_importer import pipe_to_supabase

#get keys from env
load_dotenv()
url = os.getenv("SUPABASE_URL")
key = os.getenv("SUPABASE_KEY")

Retrieving Data from Supabase and Building Dataframes

I will connect to Supabase and retrieve all records, in order to start building my analysis dataframes. I will create one dataframe for funder information, and another for grants and recipients information.

#get tables and build dataframes
tables = ["funders", "causes", "areas", "beneficiaries", "grants",
               "funder_causes", "funder_areas", "funder_beneficiaries", "funder_grants", 
               "financials", "funder_financials"]

for table in tables:
    globals()[table] = get_table_from_supabase(url, key, table)

#get recipients with filter
recipients = get_table_from_supabase(url, key, "recipients", batch_size=50, filter_recipients=True)
all_recipient_ids = set(recipients["recipient_id"].unique())

#get and filter recipient join tables
recipient_join_tables = ["recipient_grants", "recipient_areas", "recipient_beneficiaries", "recipient_causes"]
for table in recipient_join_tables:
    df = get_table_from_supabase(url, key, table)
    globals()[table] = df[df["recipient_id"].isin(all_recipient_ids)]

The Funders Dataframe

Main Table

funders_df = funders.copy()

#define table relationships for funders
funder_rels = [
    {
        "join_table": funder_causes,
        "lookup_table": causes,
        "key": "cause_id",
        "value_col": "cause_name",
        "result_col": "causes"
    },
    {
        "join_table": funder_areas,
        "lookup_table": areas,
        "key": "area_id",
        "value_col": "area_name",
        "result_col": "areas"
    },
    {
        "join_table": funder_beneficiaries,
        "lookup_table": beneficiaries,
        "key": "ben_id",
        "value_col": "ben_name",
        "result_col": "beneficiaries"
    }
]

#add relationship columns
funders_df = build_relationship_cols(funders_df, "registered_num", funder_rels)

#add grant statistics columns
funders_df = add_grant_statistics(funders_df, "registered_num", funder_grants, grants)

#round to 2 decimal places
funders_df = funders_df.round(2)
pd.set_option("display.float_format", "{:.2f}".format)

#format financial columns
float_cols = ["income_latest", "expenditure_latest", "total_given", "avg_grant", "median_grant"]
for col in float_cols:
    if col in funders_df.columns:
        funders_df[col + "_gbp"] = funders_df[col].apply(add_gbp_columns)

Financial History Table

funders_df = build_financial_history(funders_df, "registered_num", funder_financials, financials)

The List Entries

#get list entries
list_entries = get_table_from_supabase(url, key, "list_entries")
funder_list = get_table_from_supabase(url, key, "funder_list")
list_with_info = funder_list.merge(list_entries, on="list_id")

#get list of entries for each funder
list_grouped = list_with_info.groupby("registered_num")["list_info"].apply(list).reset_index()
list_grouped.columns = ["registered_num", "list_entries"]

#merge with funders and replace nans
funders_df = funders_df.merge(list_grouped, on="registered_num", how="left")
funders_df["list_entries"] = funders_df["list_entries"].apply(lambda x: x if isinstance(x, list) else [])

#extend column view, sort and preview funders
pd.set_option("display.max_columns", 100)
funders_df = funders_df.sort_values("total_given_gbp", ascending=False)
funders_df.head()

	registered_num	name	website	activities	objectives	income_latest	expenditure_latest	objectives_activities	achievements_performance	grant_policy	is_potential_sbf	is_on_list	is_nua	causes	areas	beneficiaries	num_grants	total_given	avg_grant	median_grant	income_latest_gbp	expenditure_latest_gbp	total_given_gbp	avg_grant_gbp	median_grant_gbp	income_history	expenditure_history	list_entries
872	288086	FARRER AND CO CHARITABLE TRUST	https://www.farrer.co.uk	THE MAIN PURPOSE OF THE TRUST IS TO APPLY INCO...	IN OR TOWARDS THE GENERAL PURPOSES OF SUCH CHA...	80569.00	78229.00	None	None	None	False	False	False	[General Charitable Purposes]	[Throughout England And Wales]	[Other Charities Or Voluntary Bodies]	14	97590.14	6970.72	4000.00	£80,569.00	£78,229.00	£97,590.14	£6,970.72	£4,000.00	{2020: 78285.0, 2021: 52439.0, 2022: 79299.0, ...	{2020: 76431.0, 2021: 61395.0, 2022: 76795.0, ...	[]
51	1041449	MIRACLES	https://www.miraclesthecharity.org	MIRACLES IS A CHARITY OFFERING HOPE FOR THOSE ...	THE RELIEF OF POVERTY AND HARDSHIP OF PERSONS ...	351143.00	908121.00	THE AIMS AND OBJECTS OF MIRACLES REMAIN TO PRO...	THIRTY-ONE YEARS ON SINCE THE LAUNCH OF MIRACL...	None	False	False	False	[General Charitable Purposes, Education/traini...	[Scotland, Europe, Throughout England]	[The General Public/mankind]	8	96073.00	12009.12	2750.00	£351,143.00	£908,121.00	£96,073.00	£12,009.12	£2,750.00	{2020: 235183.0, 2021: 325035.0, 2022: 303775....	{2020: 322068.0, 2021: 337276.0, 2022: 274309....	[]
968	526956	KELSICK'S EDUCATIONAL FOUNDATION	https://www.kelsick.org.uk	TO PROVIDE GRANTS FOR EDUCATIONAL NEEDS TO YOU...	1)THE PROVISION OF SPECIAL BENEFITS, OF ANY KI...	411338.00	408717.00	THE OBJECTS OF THE CHARITY ARE:\n1. THE PROVIS...	THE KELSICK'S EDUCATION FOUNDATION CONTINUES T...	THE FOUNDATION INVITES APPLICATIONS FOR GRANTS...	False	False	False	[Education/training]	[Cumbria]	[Children/young People, People With Disabiliti...	17	956424.00	56260.24	58800.00	£411,338.00	£408,717.00	£956,424.00	£56,260.24	£58,800.00	{2020: 416991.0, 2021: 387390.0, 2022: 407536....	{2020: 408890.0, 2021: 381641.0, 2022: 382351....	[]
772	260741	A P TAYLOR TRUST	https://www.aptaylortrust.org.uk	THE YEARLY INCOME OF THE FUND PROVIDED BY THE ...	FUND FOR THE USE OF THE INHABITANTS OF THE PAR...	111430.00	136811.00	THE YEARLY INCOME OF THE FUND PROVIDED BY THE ...	DURING THE YEAR, THE CHARITY CHOSE TO OFFER AD...	DURING THE YEAR, A TOTAL AMOUNT OF £36,450 WAS...	False	False	False	[General Charitable Purposes, The Advancement ...	[Hillingdon, Greater London]	[Children/young People, Elderly/old People, Pe...	75	95240.00	1269.87	300.00	£111,430.00	£136,811.00	£95,240.00	£1,269.87	£300.00	{2020: 112954.0, 2021: 105886.0, 2022: 98342.0...	{2020: 114393.0, 2021: 124987.0, 2022: 114089....	[]
806	270718	FRANCIS COALES CHARITABLE FOUNDATION	https://franciscoales.co.uk	TO ASSIST WITH GRANTS FOR THE STRUCTURAL REPAI...	TO PROVIDE FUNDS DIRECTLY OR BY WAY OF GRANT O...	117420.00	118765.00	None	None	None	False	False	False	[Environment/conservation/heritage]	[Buckinghamshire, Hertfordshire, Northamptonsh...	[Other Defined Groups]	3	93953.00	31317.67	5000.00	£117,420.00	£118,765.00	£93,953.00	£31,317.67	£5,000.00	{2020: 111553.0, 2021: 134468.0, 2022: 149302....	{2020: 129452.0, 2021: 132270.0, 2022: 120004....	[]

#get checkpoint folder
# checkpoint_folder = Path("./6.1_checkpoints/")

#create checkpoint - save df to pickle
# funders_df.to_pickle(checkpoint_folder / "funders_df.pkl")
# print("Saved funders_df to checkpoint")

The Grants Dataframe

Main Table

grants_df = grants.copy()

#ddd funder info
grants_df = grants_df.merge(funder_grants, on="grant_id")
grants_df = grants_df.merge(funders[["registered_num", "name"]], on="registered_num")
grants_df = grants_df.rename(columns={"name": "funder_name", "registered_num": "funder_num"})

#ddd recipient info  
grants_df = grants_df.merge(recipient_grants, on="grant_id")
grants_df = grants_df.merge(recipients[["recipient_id", "recipient_name", "recipient_activities"]], 
                        on="recipient_id", 
                        how="left")

#define relationships for recipients
recipient_rels = [
    {
        "join_table": recipient_areas,
        "lookup_table": areas,
        "key": "area_id",
        "value_col": "area_name",
        "result_col": "recipient_areas"
    },
    {
        "join_table": recipient_causes,
        "lookup_table": causes,
        "key": "cause_id",
        "value_col": "cause_name",
        "result_col": "recipient_causes"
    },
    {
        "join_table": recipient_beneficiaries,
        "lookup_table": beneficiaries,
        "key": "ben_id",
        "value_col": "ben_name",
        "result_col": "recipient_beneficiaries"
    }
]

#add relationship columns
grants_df = build_relationship_cols(grants_df, "recipient_id", recipient_rels)

#add source of grant
grants_df["source"] = grants_df["grant_id"].apply(lambda x: "Accounts" if str(x).startswith("2") else "360Giving")

#round to 2 decimal places
grants_df = grants_df.round(2)

#format financial columns
grants_df["amount_gbp"] = grants_df["amount"].apply(add_gbp_columns)

#sort and preview grants
grants_df = grants_df.sort_values("grant_title", ascending=True)
grants_df.head()

	grant_title	grant_desc	amount	year	grant_id	source	funder_num	funder_grants_id	funder_name	recipient_id	recipient_grants_id	recipient_name	recipient_activities	recipient_areas	recipient_causes	recipient_beneficiaries	amount_gbp
24592	"A BLUE NEW DEAL FOR UK COSTAL COMMUNITIES" PR...	TOWARDS THE COSTS OF THE 'BLUE NEW DEAL', WHIC...	90000.00	2014	360G-Ellerman-15189	360Giving	263207	44987	JOHN ELLERMAN FOUNDATION	1055254	43799	NEW ECONOMICS FOUNDATION	NEF AIMS TO MAXIMISE WELL-BEING AND SOCIAL JUS...	[Throughout England And Wales]	[Education/training, Economic/community Develo...	[The General Public/mankind]	£90,000.00
29639	"BEFRIENDING COFFEE CLUB"	THE BEFRIENDING COFFEE CLUB WAS A TWO-YEAR PRO...	27313.00	2017	360G-PeoplesHealthTrust-2017_5	360Giving	1125537	41014	PEOPLE'S HEALTH TRUST	1166949	39824	SANDWELL AFRICAN WOMEN ASSOCIATION	1. ADVICE & SUPPORT\rA) GENERAL HELP WITH CLAI...	[Sandwell, West Midlands]	[General Charitable Purposes, The Advancement ...	[Children/young People, Elderly/old People, Pe...	£27,313.00
29462	"BETTER LIVES"	THIS TWO-YEAR PROJECT DELIVERED ACTIVITIES AND...	43982.00	2017	360G-PeoplesHealthTrust-2017_103	360Giving	1125537	40917	PEOPLE'S HEALTH TRUST	1093240	39727	SOUTHEND CARERS	TO PROVIDE INFORMATION, ADVICE AND SUPPORT TO ...	[Southend-on-sea, Essex]	[Disability, Other Charitable Purposes]	[Children/young People, Elderly/old People, Pe...	£43,982.00
24854	"BLUE PANET II - TURNING VIEWRS INTO CHAMPIONS...	TOWARDS A JOINT INITIATIVE BY MCS, GREENPEACE ...	6000.00	2017	360G-Ellerman-17137	360Giving	263207	45242	JOHN ELLERMAN FOUNDATION	1004005	44054	MARINE CONSERVATION SOCIETY	THE MARINE CONSERVATION SOCIETY FIGHTS FOR THE...	[Scotland, Europe, Turks And Caicos Islands, N...	[Education/training, Arts/culture/heritage/sci...	[The General Public/mankind]	£6,000.00
25044	"BODY VESSEL CLAY – WOMEN RETHINKING CERAMICS"...	"BODY VESSEL CLAY – WOMEN RETHINKING CERAMICS"...	30000.00	2021	360G-Ellerman-18459	360Giving	263207	45468	JOHN ELLERMAN FOUNDATION	1123081	44280	BULLDOG TRUST LIMITED	THE TRUST PROVIDES FINANCIAL AND ADVISORY ASSI...	[Throughout England And Wales]	[General Charitable Purposes, Arts/culture/her...	[The General Public/mankind]	£30,000.00

#create checkpoint - save df to pickle
# grants_df.to_pickle(checkpoint_folder / "grants_df.pkl")
# print("Saved grants_df to checkpoint")

Retrieving Data from Checkpoints

#get checkpoint folder
checkpoint_folder = Path("./6.1_checkpoints/")

#get checkpoint
funders_df = pd.read_pickle(checkpoint_folder / "funders_df.pkl")
grants_df = pd.read_pickle(checkpoint_folder / "grants_df.pkl")

Data Quality

1. Missingness

I will view the structure of each dataframe to check for missing data and confirm that datatypes are correct.

Analysis of Missingness in Funders Dataframe

funders_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 996 entries, 820 to 995
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   registered_num            996 non-null    object 
 1   name                      996 non-null    object 
 2   website                   497 non-null    object 
 3   activities                991 non-null    object 
 4   objectives                996 non-null    object 
 5   income_latest             996 non-null    float64
 6   expenditure_latest        996 non-null    float64
 7   objectives_activities     229 non-null    object 
 8   achievements_performance  193 non-null    object 
 9   grant_policy              52 non-null     object 
 10  is_potential_sbf          996 non-null    bool   
 11  is_on_list                996 non-null    bool   
 12  is_nua                    996 non-null    bool   
 13  causes                    996 non-null    object 
 14  areas                     996 non-null    object 
 15  beneficiaries             996 non-null    object 
 16  num_grants                309 non-null    Int64  
 17  total_given               309 non-null    float64
 18  avg_grant                 309 non-null    float64
 19  median_grant              309 non-null    float64
 20  income_latest_gbp         996 non-null    object 
 21  expenditure_latest_gbp    996 non-null    object 
 22  total_given_gbp           996 non-null    object 
 23  avg_grant_gbp             996 non-null    object 
 24  median_grant_gbp          996 non-null    object 
 25  income_history            996 non-null    object 
 26  expenditure_history       996 non-null    object 
 27  list_entries              996 non-null    object 
dtypes: Int64(1), bool(3), float64(5), object(19)
memory usage: 206.2+ KB

#missing activities - check manually
missing_activities = funders_df[funders_df["activities"].isna()]["registered_num"].tolist()
print(missing_activities)

['1207622', '1203444', '1203389', '1203342', '1204043']

#missing sections - check proportion of funders with sections extracted from accounts
has_sections = funders_df[
    funders_df["objectives_activities"].notna() |
    funders_df["achievements_performance"].notna() |
    funders_df["grant_policy"].notna()
]

has_sections_total = len(has_sections)
has_sections_proportion = has_sections_total / 996

print(f"Funders with accessible accounts: 327")
print(f"Funders with extracted sections: {has_sections_total}")
print(f"Proportion of funders with sections: {has_sections_proportion:.2%}")

Funders with accessible accounts: 327
Funders with extracted sections: 242
Proportion of funders with sections: 24.30%

#missing classifications - check manually
classifications = ["causes", "areas", "beneficiaries"]

for classification in classifications:
    missing = funders_df[funders_df[classification].apply(lambda x: len(x) == 0)]
    print(f"{len(missing)} ({len(missing)/len(funders_df)*100:.1f}%) missing from {classification}")
    print(f"Registered numbers: {missing['registered_num'].tolist()}\n")

0 (0.0%) missing from causes
Registered numbers: []

0 (0.0%) missing from areas
Registered numbers: []

7 (0.7%) missing from beneficiaries
Registered numbers: ['1180365', '1203444', '1180966', '1181515', '1174620', '1191072', '1166657']

Exploration of Findings from Missingness Analysis (Funders)

There are five funders in the database with empty activities. I have checked the Charity Commission website and it does appear that these funders have simply not declared any activities. They are all relatively new having submitted only one set of accounts. The activities_objectives column has not been populated for any of these funders, which is unfortunate as this could have provided a further source of information. One of these funders has a website so, if I am able to achieve my stretch target of scraping websites, this may offer details (although at the time of writing, the website does not exist).

It is expected that a significant proportion of funders would not have a website, and no action is required to address this.

Checking missing classifications shows that the dataset is almost entirely complete in this regard; just seven funders are missing beneficiaries, and having checked the Charity Commission website, it appears that these funders have not declared any beneficiaries. This is such a small proportion of the dataset that I am not concerned about its impact on analysis and embeddings.

Problem with Accounts

My database building scripts (for PDFs) have a serious limitation as I am unable to scrape the Charity Commission website for accounts where the page contains JavaScript. The older pages, which are basic HTML, are accessible but the newer ones are not, and I am unable to tell which charities have been updated to the new system until the script attempts to scrape them and fails. Having built the database, I can now see from the print statements that 327 funders have accessible accounts, representing 32.8% of the 996 funders in the sample. My calculations above show that the required sections have been extracted from accounts for 24.4% of funders - varying from 52 funders with grant_policy, to 193 funders with achievements_peformance and 229 funders with objectives_activities.

Analysis of Missingness in Grants Dataframe

grants_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 33152 entries, 6305 to 33151
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   grant_title              18116 non-null  object 
 1   grant_desc               18116 non-null  object 
 2   amount                   33106 non-null  float64
 3   year                     33152 non-null  int64  
 4   grant_id                 33152 non-null  object 
 5   source                   33152 non-null  object 
 6   funder_num               33152 non-null  object 
 7   funder_grants_id         33152 non-null  int64  
 8   funder_name              33152 non-null  object 
 9   recipient_id             33152 non-null  object 
 10  recipient_grants_id      33152 non-null  int64  
 11  recipient_name           33152 non-null  object 
 12  recipient_activities     15829 non-null  object 
 13  recipient_areas          33152 non-null  object 
 14  recipient_causes         33152 non-null  object 
 15  recipient_beneficiaries  33152 non-null  object 
 16  amount_gbp               33152 non-null  object 
dtypes: float64(1), int64(3), object(13)
memory usage: 4.6+ MB

#missing grant title and description - check numbers per source
missing_by_source = grants_df.groupby("source").agg({
    "grant_title": lambda x: x.isna().sum(),
    "grant_desc": lambda x: x.isna().sum(),
    "grant_id": "count"
}).rename(columns={"grant_id": "total_grants", "grant_title": "missing_title", "grant_desc": "missing_desc"})

print(missing_by_source)

           missing_title  missing_desc  total_grants
source                                              
360Giving              0             0         18116
Accounts           15036         15036         15036

#missing grant amounts - spotcheck manually
missing_amounts = grants_df[grants_df["amount"].isna()]["funder_num"].tolist()
print(missing_amounts)

['217929', '217929', '217929', '217929', '217929', '217929', '217929', '217929', '252222', '1052958', '525834', '244053', '503384', '503384', '503384', '503384', '503384', '503384', '503384', '503384', '503384', '503384', '503384', '503384', '219289', '1017255', '803301', '217929', '217929', '217929', '237048', '237048', '237048', '237048', '237048', '237048', '237048', '237048', '237048', '237048', '291363', '291363', '291363', '291363', '268745', '1059451']

#missing recipient activities - compare by source
missing_recips = grants_df["recipient_activities"].isna().sum()
accounts_recips = grants_df[
    (grants_df["source"] == "Accounts") &
    (grants_df["recipient_activities"].isna())
]
missing_360g = grants_df[
    (grants_df["source"] == "360Giving") &
    (grants_df["recipient_activities"].isna())
]

print(f"Grants extracted from accounts: {len(accounts_recips):,} ({len(accounts_recips)/missing_recips*100:.1f}%)")
print(f"360Giving recipients with missing activities: {len(missing_360g):,} ({len(missing_360g)/missing_recips*100:.1f}%)")

Grants extracted from accounts: 11,491 (66.3%)
360Giving recipients with missing activities: 5,832 (33.7%)

Exploration of Findings from Missingness Analysis (Grants)

There are 33,147 rows in the dataframe, 287 more than the number of unique grants as some grants are given to multiple recipients. The deduplication script in stats_builder.py ensures that the value of each grant has only been counted once for the purpose of the summary/calculated statistics.

I have explored the missing data from grant_title and grant_desc to check that only grants extracted from accounts are missing these fields. This was to be expected as these details are not available for the grants from accounts; the figures above confirm that every grant from the 360Giving API is complete with a title and description.

Of the 32,815 grants in the database, 46 (0.14%) are missing amount. I have performed a spot-check of 12 grants (25%), which suggests that these are genuine extraction errors from funder accounts rather than systematic issues. I have decided to keep these 46 grants in the database as they still provide valuable information about funder-recipient relationships and giving patterns, and such a small number of missing amounts is not likely to affect analyses/models.

I also explored the 17,318 rows in grants_df with missing recipient_activities. Null values in this column are to be expected for recipients that are added following extraction from PDFs, which accounts for 11,486 (66.3%). The remaining 5,832 (33.7%) null values - which are present within records added to the database by the 360Giving API - can be explained by: - Charities that have been removed from the Charity Commission, e.g. if they have closed down - Registered charities that simply do not declare their activities (as present in 5 records in funders_df) - Where abnormalities are evident in registered_num. For example, the RSPB is identified in the 360Giving API by 207076 & SC037654 - its registered numbers for both the Charity Commission of England & Wales and the Office of the Scottish Charity Regulator. My database builder script was unable to match on 207076 & SC037654 as registered_num.

2. Word Counts

I will check the lengths of the shortest and longest text entries, to ensure that they have been imported correctly and are not too short or long for unexpected reasons. Many funders provide very short explanations of their activities/objectives etc., such as simply “grant-giving” which is just one word - so this would not be abnormal. I will confirm that particularly long entries are not corrupted or the result of multiple documents being combined accidentally.

Analysis of Word Counts in Funders Dataframe

#check word counts for text columns in funders df
funders_text_cols = ["activities", "objectives", "objectives_activities", "achievements_performance", "grant_policy"]

#create columns
for col in funders_text_cols:
    funders_df[f"word_count_{col}"] = funders_df[col].str.split().str.len()
for col in funders_text_cols:
    print(f"{col.upper()}")
    print(f"{'_'*30}\n")

    word_count_col = f"word_count_{col}"
    not_nas = funders_df[funders_df[word_count_col].notna()]

    #get minimums and maximums for each text column
    if len(not_nas) > 0:
        min_idx = not_nas[word_count_col].idxmin()
        max_idx = not_nas[word_count_col].idxmax()

        examples = funders_df.loc[[min_idx, max_idx], ["registered_num", "name", word_count_col, col]]
        examples.index = ["Minimum", "Maximum"]

        display(examples)
    else:
        print("No data available\n")

ACTIVITIES
______________________________

	registered_num	name	word_count_activities	activities
Minimum	277767	WANSTEAD HIGH SCHOOL PARENT-TEACHER ASSOCIATION	1.00	FUNDRAISING
Maximum	217929	BARTON UNDER NEEDWOOD AND DUNSTALL KEY TRUST	76.00	TO PAY THE MINISTER OF THE CHURCH OF ST JAMES ...

OBJECTIVES
______________________________

	registered_num	name	word_count_objectives	objectives
Minimum	213077	EPPING AND THEYDON GARNON CHARITIES	2	SEE CONSTITUENTS
Maximum	1104413	INSPIRE HOUNSLOW	524	3. THE OBJECTS OF THE COMPANY (THE "OBJECTS") ...

OBJECTIVES_ACTIVITIES
______________________________

	registered_num	name	word_count_objectives_activities	objectives_activities
Minimum	1030939	TESLER FOUNDATION	16.00	OBJECTIVES AND AIMS THE CHARITY IS GOVERNED BY...
Maximum	263207	JOHN ELLERMAN FOUNDATION	5426.00	TARGET OF CASH + 4% ANNUALLY, NET OF COSTS. A ...

ACHIEVEMENTS_PERFORMANCE
______________________________

	registered_num	name	word_count_achievements_performance	achievements_performance
Minimum	802125	TOMCHEI TORAH CHARITABLE TRUST	14.00	CHARITABLE ACTIVITIES DURING THE YEAR THE CHAR...
Maximum	263207	JOHN ELLERMAN FOUNDATION	5426.00	TARGET OF CASH + 4% ANNUALLY, NET OF COSTS. A ...

GRANT_POLICY
______________________________

	registered_num	name	word_count_grant_policy	grant_policy
Minimum	1057692	DOWNEND ROUND TABLE	8.00	POLICY ON SOCIAL INVESTMENT INCLUDING PROGRAM ...
Maximum	299918	C A REDFERN CHARITABLE FOUNDATION	1181.00	THE TRUSTEES MEET REGULARLY TO DISCUSS THE MAK...

Word Counts for Sections Returned from Charity Commission API

#high word counts - check top 10 longest sections from API (to manually check)
long_activities = get_longest_values(funders_df, "word_count_activities", "registered_num")
long_objectives = get_longest_values(funders_df, "word_count_objectives", "registered_num")

api_sections_long = [
    ("activities", long_activities),
    ("objectives", long_objectives)
]

for name, longest in api_sections_long:
    print(f"Top 10 longest {name}: {longest}")

Top 10 longest activities: ['217929', '213077', '207342', '298379', '516301', '1135751', '272671', '1114624', '1197536', '1193970']
Top 10 longest objectives: ['1104413', '310799', '1132994', '229974', '1190948', '1194823', '1094675', '1047947', '1173674', '1147921']

#low word counts - count and view short sections from API
api_sections_short = {
    "activities": "Short activities sections",
    "objectives": "Short objectives sections"
}

for col, title in api_sections_short.items():
    funders_df[f"short_{col}_section"] = (
        (funders_df[col].str.split().str.len() < 5) &
        (funders_df[col].str.upper().str.strip() != "GENERAL CHARITABLE PURPOSES")
    )
    short_sections = funders_df[funders_df[f"short_{col}_section"]][col].dropna().unique()
    
    print(f"\n{title}: {funders_df[f'short_{col}_section'].sum():,}")
    print_in_rows(short_sections, 5)


Short activities sections: 39
A GRANT GIVING TRUST, CHARITABLE PURPOSES, GENERAL CHARITABLE PURPOSES., GRANT MAKING, CHARITABLE GIVING
PROJECTS FOR NEEDY, FUNDRAISING, GRANT MAKING CHARITY, GRANT-MAKING CHARITY, GENERAL CHARITABLE DONATIONS
GRANT GIVING., MAKING GRANTS, GRANT-MAKING, ANIMAL WELFARE, AT THE TRUSTEES DISCRETION.
FUNDING CHARITABLE CAUSES., GRANT GIVING CHARITY, GRANT-MAKING., ASSISTANCE WITH RELIGIOUS EDUCATION, ENCOURAGING CHARITABLE GIVING
ADVANCING ORTHODOX JEWISH FAITH, GRANTS TO ADVANCE EDUCATION., RELIEF OF POOR, HELPING DISADVANTAGED CHILDREN, EDUCATION / TRAINING
NONE, PROVIDING GRANTS, GENERAL CHARITABLE ACTIVITIES, THROUGHOUT ENGLAND AND WALES, NONE UNDERTAKEN
CHARITABLE GRANTS MADE, SUPPORT LOCAL CHARITIES.

Short objectives sections: 15
SEE INDIVIDUAL CONSTITUENTS, GENERAL CHARITABLE PURPOSES., SEE INDIVIDUAL CONSTITUENTS., SEE CONSTITUENTS, FOR CHARITABLE PURPOSES.
ANY CHARITABLE PURPOSE.

#low word counts - check activities containing "NONE"
activities_none = funders_df["activities"].str.contains("NONE", na=False)
funders_df[activities_none]

	registered_num	name	website	activities	objectives	income_latest	expenditure_latest	objectives_activities	achievements_performance	grant_policy	is_potential_sbf	is_on_list	is_nua	causes	areas	beneficiaries	num_grants	total_given	avg_grant	median_grant	income_latest_gbp	expenditure_latest_gbp	total_given_gbp	avg_grant_gbp	median_grant_gbp	income_history	expenditure_history	list_entries	word_count_activities	word_count_objectives	word_count_objectives_activities	word_count_achievements_performance	word_count_grant_policy	short_activities_section	short_objectives_section
355	505515	ALLENDALE EXHIBITION ENDOWMENT	None	NONE	THE INCOME OF THE CHARITY SHALL BE APPLIED BY-...	24000.00	14299.00	None	None	None	False	False	False	[Education/training]	[Northumberland]	[Children/young People]	<NA>	NaN	NaN	NaN	£24,000.00	£14,299.00				{2020: 7044.0, 2021: 4100.0, 2022: 13836.0, 20...	{2020: 7190.0, 2021: 5000.0, 2022: 13083.0, 20...	[]	1.00	111	NaN	NaN	NaN	True	False
367	1103709	MARCIA AND ANDREW BROWN CHARITABLE TRUST	None	NONE UNDERTAKEN	TO BENEFIT SUCH EXCLUSIVELY CHARITABLE PURPOSE...	304104.00	190330.00	None	None	None	False	False	False	[General Charitable Purposes]	[Throughout England And Wales]	[The General Public/mankind]	<NA>	NaN	NaN	NaN	£304,104.00	£190,330.00				{2020: 201144.0, 2021: 400020.0, 2022: 1850115...	{2020: 33618.0, 2021: 63195.0, 2022: 87510.0, ...	[]	2.00	18	NaN	NaN	NaN	True	False

Word Counts for Sections Extracted from PDF Accounts

#high word counts - check top 10 longest extracted sections (to manually check)
long_obj_act = get_longest_values(funders_df, "word_count_objectives_activities", "registered_num")
long_ach_perf = get_longest_values(funders_df, "word_count_achievements_performance", "registered_num")
long_grant_policy = get_longest_values(funders_df, "word_count_grant_policy", "registered_num")

accounts_sections_long = [
    ("objectives_activities", long_obj_act),
    ("achievements_performance", long_ach_perf),
    ("grant_policy", long_grant_policy)
]

for name, longest in accounts_sections_long:
    print(f"Top 10 longest {name}: {longest}")

Top 10 longest objectives_activities: ['263207', '803428', '252892', '213890', '1041449', '296284', '217929', '288068', '299918', '260589']
Top 10 longest achievements_performance: ['263207', '803428', '200198', '213890', '1041449', '217929', '299918', '254532', '296284', '288068']
Top 10 longest grant_policy: ['299918', '1038860', '216308', '288068', '257636', '1026752', '272161', '255281', '226121', '1040778']

#low word counts - count and view short sections from accounts
accounts_sections_short = {
    "objectives_activities": "Short objectives_activities sections",
    "achievements_performance": "Short achievements_performance sections",
    "grant_policy": "Short grant_policy sections"
}

for col, title in accounts_sections_short.items():
    funders_df[f"short_{col}_section"] = (
        (funders_df[col].str.split().str.len() < 20)
    )
    short_sections = funders_df[funders_df[f"short_{col}_section"]][col].dropna().unique()
    
    print(f"\n{title}: {funders_df[f'short_{col}_section'].sum():,}")
    print_in_rows(short_sections, 2)


Short objectives_activities sections: 3
THE TRUSTEES CONTINUALLY CONSIDER THE ONGOING POSITION OF THE FARM LAND, PARTICULARLY IN VIEW OF POSSIBLE DEVELOPMENT., - - - - TOTAL INCOMING RESOURCES  52,119 - 52,119 43,051    __ _ ________ _____ __ ___ ________
OBJECTIVES AND AIMS THE CHARITY IS GOVERNED BY A DEED OF TRUST DATED 21ST FEBRUARY 1992.

Short achievements_performance sections: 1
CHARITABLE ACTIVITIES DURING THE YEAR THE CHARITY MADE CHARITABLE GRANTS TOTALLING £13,374 (2022: £15,530).

Short grant_policy sections: 10
POLICY ON SOCIAL INVESTMENT INCLUDING PROGRAM RELATED INVESTMENT, GRANTS ARE MADE TO CHARITABLE INSTITUTIONS, ORGANIZATIONS AND AUTHORIZED INDIVIDUALS WHICH ACCORD WITH THE OBJECTS OF THE CHARITY.
GRANTS ARE MADE TO CHARITABLE INSTITUTIONS AND ORGANIZATIONS WHICH ACCORD WITH THE OBJECTS OF THE CHARITY.

Exploration of Findings from Word Count Analysis (Funders)

Sections from API

Having manually checked the maximum values against their Charity Commission records, I am confident that there are no entries that are concerningly long. The top 10 longest activities are not abnormal; they are just relatively wordy compared to others. I viewed the shortest values (fewer than five words) and concluded that they are acceptable, although I will reassign the values containing “NONE” to N/A to improve the quality of the embedding.

Sections from Accounts

The word count analysis has highlighted that there are numerous issues with the sections that have been extracted from accounts. Many that are too long contain multiple sections, whereas many that are too short have been cut off prematurely. There is also a problem of repetition - the extractor sometimes extracts the same (excessively long) chunk of text for both objectives_activities and achievements_performance, defeating the purpose. Looking at the top 10 shortest and longest has demonstrated that the regex extraction technique employed in the database builder has not worked very well. Rather than rebuilding the entire PDF extraction pipeline, I will address this data quality issue by re-processing the extracted text through an LLM-based extractor with the aim of better handling the variation in section sizes and formatting. The LLM should be better at the fuzzy matching that regex has struggled with.

Analysis of Word Counts in Grants Dataframe

#check word counts for text columns in grants df
grants_text_cols = ["grant_title", "grant_desc", "recipient_activities"]

#create columns
for col in grants_text_cols:
    grants_df[f"word_count_{col}"] = grants_df[col].str.split().str.len()
for col in grants_text_cols:
    print(f"{col.upper()}")
    print(f"{'_'*30}\n")

    word_count_col = f"word_count_{col}"
    not_nas = grants_df[grants_df[word_count_col].notna()]

    #get minimums and maximums for each text column
    if len(not_nas) > 0:
        min_idx = not_nas[word_count_col].idxmin()
        max_idx = not_nas[word_count_col].idxmax()

        examples = grants_df.loc[[min_idx, max_idx], ["funder_num", "funder_name", "grant_id", "recipient_name", word_count_col, col]]
        examples.index = ["Minimum", "Maximum"]

        display(examples)
    else:
        print("No data available\n")

GRANT_TITLE
______________________________

	funder_num	funder_name	grant_id	recipient_name	word_count_grant_title	grant_title
Minimum	1125537	PEOPLE'S HEALTH TRUST	360G-PeoplesHealthTrust-2015_153	BEAT ROUTES	1.00	#LEVELS
Maximum	263207	JOHN ELLERMAN FOUNDATION	360G-Ellerman-18741	MORE IN COMMON	38.00	MORE IN COMMON WILL PROVIDE SUPPORT FOR THE FO...

GRANT_DESC
______________________________

	funder_num	funder_name	grant_id	recipient_name	word_count_grant_desc	grant_desc
Minimum	263207	JOHN ELLERMAN FOUNDATION	360G-Ellerman-17024	CLORE SOCIAL LEADERSHIP PROGRAMME	1.00	BURSARIES
Maximum	1091263	LONDON COMMUNITY FOUNDATION	360G-LondonCF-A583241	ACLE EVENING WOMEN'S INSTITUTE	259.00	MAJORITY OF OUR SERVICE USERS ARE FROM BME COM...

RECIPIENT_ACTIVITIES
______________________________

	funder_num	funder_name	grant_id	recipient_name	word_count_recipient_activities	recipient_activities
Minimum	283813	LONDON MARATHON CHARITABLE TRUST LIMITED	360G-LMCT-68_2023	EDMONTON COMMUNITY PARTNERSHIP	1.00	.
Maximum	1091263	LONDON COMMUNITY FOUNDATION	360G-LondonCF-A561932	LEWISHAM MULTILINGUAL ADVICE SERVICE	500.00	LEWISHAM MULTILINGUAL ADVICE SERVICE (LMLAS) I...

#low word counts - count and view short grant details from 360Giving API/CC public extract
grant_details = {
    "grant_title": "Short grant title",
    "grant_desc": "Short grant description",
    "recipient_activities": "Short recipient activities"
}

for col, title in grant_details.items():
    grants_df[f"short_{col}_section"] = (
        (grants_df[col].str.split().str.len() < 3)
    )
    short_grant_details = grants_df[grants_df[f"short_{col}_section"]][col].dropna().unique()
    
    # print(f"\n{title}: {grants_df[f'short_{col}_section'].sum():,}")
    # print_in_rows(short_grant_details, 12)

#high word counts - check top 10 longest grant details (to manually check)
long_grant_title = get_longest_values(grants_df, "word_count_grant_title", "grant_id")
long_grant_desc = get_longest_values(grants_df, "word_count_grant_desc", "grant_id")
long_recip_activities = get_longest_values(grants_df, "word_count_recipient_activities", "grant_id")

sections = [
    ("grant_title", long_grant_title),
    ("grant_desc", long_grant_desc),
    ("recipient_activities", long_recip_activities)
]

for name, longest in sections:
    print(f"Top 10 longest {name}: {longest}")

Top 10 longest grant_title: ['360G-Ellerman-18741', '360G-Ellerman-18295', '360G-Ellerman-17745', '360G-Ellerman-18026', '360G-LMCT-07_2021', '360G-Ellerman-20290', '360G-Ellerman-20116', '360G-Ellerman-19350', '360G-Ellerman-18035', '360G-LMCT-18_2015']
Top 10 longest grant_desc: ['360G-LondonCF-A583241', '360G-PeoplesHealthTrust-2023_137', '360G-PeoplesHealthTrust-2023_39', '360G-PeoplesHealthTrust-2023_24', '360G-PeoplesHealthTrust-2014_176', '360G-PeoplesHealthTrust-2016_200', '360G-PeoplesHealthTrust-2023_25', '360G-PeoplesHealthTrust-2023_14', '360G-PeoplesHealthTrust-2014_192', '360G-PeoplesHealthTrust-2013_41']
Top 10 longest recipient_activities: ['360G-LondonCF-A584528', '360G-LondonCF-A561932', '360G-LondonCF-A584264', '360G-LondonCF-A575782', '360G-LondonCF-A566462', '360G-LondonCF-A585091', '360G-LondonCF-A583159', '360G-LondonCF-A567136', '360G-LondonCF-A561466', '360G-LondonCF-A565064']

Exploration of Findings from Word Count Analysis (Grants)

The word count analysis on the grants dataframe shows that there are almost 2,000 entries with a length of fewer than three words. Having inspected these, I can see that there are a small number of unusual values that need to be cleaned, such as an entry in recipient_activities that reads #NAME?. There are also several hundred grant titles that appear to be IDs as opposed to descriptive titles (e.g. VFTF R6-SOLACE-2024); these will not lend themselves to embedding, so I will remove them. They all appear to end in a dash and a year, so can be identified by “-2”.

In terms of high word counts, there are no concerns. As with the information called from the API in the funders dataframe, the longest grant details are simply relatively long and wordy. I am confident that the information from the 360Giving API is highly reliable, with a very small minority of unexpected values that can be cleaned easily.

3. Quality of Data from APIs

Although the data that has been returned from the APIs is generally clean and reliable, there are some minor issues that require tidying.

Problems Identified by Word Count Analyses

#replace ID grant titles with None
grants_df.loc[grants_df["grant_title"].str.contains("-2", na=False), "grant_title"] = None

#replace unusual values with None
unusual_values = ["#NAME?", "NIL ACTIVITY.", ".", "AS BEFORE", "IN LIQUIDATION", "NONE", "0", "N/A", "-"]
grants_df["grant_desc"] = grants_df["grant_desc"].replace(unusual_values, None)
grants_df["recipient_activities"] = grants_df["recipient_activities"].replace(unusual_values, None)

#remove html entities from activities
grants_df["recipient_activities"] = grants_df["recipient_activities"].str.replace("^&#X2;", "", regex=True, case=False)

#clean start of activities strings
grants_df["recipient_activities"] = grants_df["recipient_activities"].str.replace("^[^a-zA-Z0-9]+", "", regex=True)

#print grant details again to confirm cleaning has been successful
grant_details = {
    "grant_title": "Short grant title",
    "grant_desc": "Short grant description",
    "recipient_activities": "Short recipient activities"
}

for col, title in grant_details.items():
    grants_df[f"short_{col}_section"] = (
        (grants_df[col].str.split().str.len() < 3)
    )
    short_grant_details = grants_df[grants_df[f"short_{col}_section"]][col].dropna().unique()
    
    # print(f"\n{title}: {grants_df[f'short_{col}_section'].sum():,}")
    # print_in_rows(short_grant_details, 12)

Checking and Cleaning Other Variables (Funders)

#check funder names
name_lengths = funders_df["name"].str.len()

print(f"Funders with names < 10 characters: {(name_lengths < 10).sum()}")
print(f"Funders with names > 75 characters: {(name_lengths > 75).sum()}")

print("\nTop 10 shortest funder names (< 10 characters)")
short_names = funders_df[name_lengths < 10][["registered_num", "name", "website"]].sort_values("name", key=lambda x: x.str.len()).head(10)
display(short_names)

print("\nTop 10 longest funder names (> 75 characters)")
long_names = funders_df[name_lengths > 100][["registered_num", "name", "website"]].sort_values("name", key=lambda x: x.str.len(), ascending=False)
display(long_names)

Funders with names < 10 characters: 23
Funders with names > 75 characters: 1

Top 10 shortest funder names (< 10 characters)

	registered_num	name	website
156	1160614	STAR	https://www.starfoundation.org.uk
416	1161405	SEDCU	https://www.sedcu.org.uk
559	1187731	JOSHA	None
764	1082476	PRAGYA	https://www.pragya.org
736	1172017	CLYMER	None
578	1197149	RESEED	https://reseed.org.uk
517	1087870	CMZ LTD	None
783	1147360	LIFT UK	https://www.liftuk.org
609	1190525	RHIZA UK	https://rhizauk.uk/
897	1041449	MIRACLES	https://www.miraclesthecharity.org


Top 10 longest funder names (> 75 characters)

	registered_num	name	website
904	220487	CHARITY FOR THE REPARATION OF THE CHURCH AND S...	None

#check funder websites
website_lengths = funders_df["website"].str.len()

print(f"Funders with websites < 15 characters: {(website_lengths < 15).sum()}")
print(f"Funders with websites > 75 characters: {(website_lengths > 75).sum()}")

print("\nTop 10 shortest funder websites (< 15 characters)")
short_websites = funders_df[website_lengths < 15][["registered_num", "name", "website"]].sort_values("website", key=lambda x: x.str.len()).head(10)
display(short_websites)

print("\nTop 10 longest funder websites (> 75 characters)")
long_websites = funders_df[website_lengths > 75][["registered_num", "name", "website"]].sort_values("website", key=lambda x: x.str.len(), ascending=False)
display(long_websites)

Funders with websites < 15 characters: 6
Funders with websites > 75 characters: 2

Top 10 shortest funder websites (< 15 characters)

	registered_num	name	website
839	801622	SUSAN AND STEPHEN JAMES CHARITABLE SETTLEMENT
616	1126336	ANAV CHARITABLE TRUST
767	1069996	SUPPORT TIFFIN GIRLS' SCHOOL COMPANY
266	246116	CLERGY WIDOWS' AND ORPHANS' FUND
272	1138049	PERU PEOPLE
159	287404	LIBRARY SERVICES TRUST	https://L.S.T


Top 10 longest funder websites (> 75 characters)

	registered_num	name	website
955	200693	EVERSHOLT PAROCHIAL CHARITY	https://www.eversholtvillage.co.uk/history-fac...
51	273057	WYMONDHAM COLLEGE PARENT-STAFF ASSOCIATION	https://www.wymondhamcollege.org/home/partners...

#reassign null to incorrectly stored urls
funders_df.loc[website_lengths < 15, "website"] = None

#remove leading characters from paragraph columns
funders_df["activities"] = funders_df["activities"].apply(clean_start_of_text)
funders_df["objectives"] = funders_df["objectives"].apply(clean_start_of_text)

Checking and Cleaning Other Variables (Grants)

#check recipient ids - view ids with unexpected starting characters
invalid_ids = grants_df[~grants_df["recipient_id"].str.startswith(("1", "2", "3", "4", "5", "6", "7", "8", "9", "P"), na=False)]["recipient_id"].unique()
# print(list(invalid_ids))

#store original ids
grants_df["original_recipient_id"] = grants_df["recipient_id"].copy()

#remove incorrect spaces and 0s
grants_df["recipient_id"] = grants_df["recipient_id"].astype(str).str.strip()
grants_df["recipient_id"] = grants_df["recipient_id"].str.lstrip("0")

#reassign remaining invalid ids
invalid_ids = ~grants_df["recipient_id"].str.startswith(("1", "2", "3", "4", "5", "6", "7", "8", "9", "P"), na=False)
grants_df.loc[invalid_ids, "recipient_id"] = [f"invalid_{i}" for i in range(invalid_ids.sum())]

#map old and new ids
id_mapping = grants_df[grants_df["recipient_id"] != grants_df["original_recipient_id"]][["original_recipient_id", "recipient_id"]].drop_duplicates()

#check result
invalid_ids = grants_df[~grants_df["recipient_id"].str.startswith(("1", "2", "3", "4", "5", "6", "7", "8", "9", "P", "i"), na=False)]["recipient_id"].unique()
print(f"Invalid IDs remaining: {list(invalid_ids)}")

Invalid IDs remaining: []

#remove leading characters from paragraph columns
grants_df["recipient_activities"] = grants_df["recipient_activities"].apply(clean_start_of_text)

Explanation of Cleaning Process

Funder Names and Websites

I am satisfied that there are no concerns with the length of funder names. Looking at the shortest websites, there appeared to be one invalid URL, and five that are stored in an incorrect null format; I reassigned all six of these to None.

Recipient IDs

Some values in recipient_id need to be cleaned. There is a small subset with incorrect starting characters, and several that are not registered charity numbers. Working with organisations that are not registered with the Charity Commission (of England & Wales) is out of the scope of this project. Where there is clearly an error in the Charity Commission registered number, I have cleaned these values. I have removed spaces and 0s from the start of IDs and reassigned any other values (e.g. N/A or Exempt) to show that the ID was invalid. Where recipient_id is a company number or a registered number from Scotland or Northern Ireland, I have also assigned an “invalid” ID. The text content of these records can be embedded as-is, but anything else (e.g. calling a different API) is out of the scope of the project so no additional value will be gleaned from keeping the original IDs.

4. Quality of Extracted Sections and Grants

Sections

To extract sections (i.e activities_objectives, achievements_performance and grants_policy), I have relied on regex matching. Unfortunately, as observed during the Word Count Analysis, there are noteable errors and quality issues that would degrade the quality of embeddings and undermind the reliability of semantic similarity comparisons. As such, they cannot be ignored. The grants have been extracted using the Claude API (using the Haiku 3 model), which has resulted in some errors but not to the same extent, and not as irreperable. I will reprocess the PDFs by running them through the Claude API, to improve the chances of the sections being correctly and consistently extracted.

#load reprocessed sections
sections_df = pd.read_csv("./6.2_sections_reprocessor/llm_sections.csv")
sections_df.head()

	registered_num	year_processed	objectives_activities	achievements_performance	grant_policy	error
0	272161	2024	The trustees are directed by the trust deed to...	The Trust generated £261,918 (2023: £259,731) ...	The published policy of the trustees is to hel...	NaN
1	211715	2024	The object of the Trust is to relieve persons ...	During the year the Grants Sub-Committee revie...	The Trustees consider applications for grants ...	NaN
2	1007307	2024	The objective of the charity is to provide fin...	NaN	NaN	NaN
3	235891	2024	The charity is controlled by its governing doc...	NaN	NaN	NaN
4	803686	2025	The charity is a non-profit seeking charitable...	The charity's achievements during the period i...	When considering applications for grants, the ...	NaN

#check efficacy of sections reprocessor
sections_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 295 entries, 0 to 294
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   registered_num            295 non-null    int64 
 1   year_processed            295 non-null    int64 
 2   objectives_activities     276 non-null    object
 3   achievements_performance  263 non-null    object
 4   grant_policy              154 non-null    object
 5   error                     5 non-null      object
dtypes: int64(2), object(4)
memory usage: 14.0+ KB

#view 10% random selection of each reprocessed section
section_columns = ["objectives_activities", "achievements_performance", "grant_policy"]

for col in section_columns:
    #get sample of sections with values
    non_null = sections_df[sections_df[col].notna()]
    sample_size = int(len(non_null) * 0.1)
    sample = non_null.sample(n=sample_size, random_state=42)

    #display sections to spotcheck
    print(f"\n{col.upper()}")
    print(f"Total: {len(non_null)}\nSample: {sample_size}\n")

    # for idx, (i, row) in enumerate(sample.iterrows(), 1):
    #     print(f"---Sample {idx} (Funder {row['registered_num']})---")
    #     print(row[col])
    #     print()


OBJECTIVES_ACTIVITIES
Total: 276
Sample: 27


ACHIEVEMENTS_PERFORMANCE
Total: 263
Sample: 26


GRANT_POLICY
Total: 154
Sample: 15

#drop sections columns from funder df
sections_cols = ["objectives_activities", "achievements_performance", "grant_policy"]
funders_df = funders_df.drop(columns=sections_cols, errors="ignore")
sections_df["registered_num"] = sections_df["registered_num"].astype(str)

#make sections uppercase
for col in sections_cols:
    sections_df[col] = sections_df[col].str.upper()

#merge reprocessed sections back into funders df
funders_df = funders_df.merge(
    sections_df[["registered_num"] + sections_cols],
    on="registered_num",
    how="left"
)

#confirm merge
print(f"objectives_activities: {funders_df['objectives_activities'].notna().sum()}, \
achievements_performance: {funders_df['achievements_performance'].notna().sum()}, \
grant_policy: {funders_df['grant_policy'].notna().sum()}")

objectives_activities: 276, achievements_performance: 263, grant_policy: 154

Exploration of Reprocessed Sections

The output from the .info() method shows that more funders now have values for the extracted sections - increasing from 52 to 154 funders with grant_policy, from 193 to 263 funders with achievements_peformance and from 229 to 276 funders with objectives_activities. The random spotcheck of 10% of the values for each section has revealed that the quality is much higher. Other than some small spelling errors (e.g. Uttar Grades instead of Uttar Pradesh), there do not seem to be extensive quality issues in these sections that could severely impact the quality of embeddings. The fact that I have noted issues with extracted grants (which have only ever been extracted using Claude) demonstrates that using an LLM to extract text from PDFs cannot be relied upon as a perfect method; however, given its substantial improvement over regex matching (and the absence of more robust extraction methods within the project’s constraints), I consider it to be the most appropriate solution available.

Grants - Names

The messiest part of the extracted grants data is recipient_name, which has been built from two sources. The first is the Charity Commission’s Public Extract, which is updated daily and includes registered charities’ names as logged with the Charity Commission. These names are generally clean and standardised, though charities occasionally use trading names that differ from their official registered names.

The second source is the organisation’s name as extracted from funders’ accounts by the Claude API. This is significantly less reliable and is a noted limitation of this project. The reliability is affected in part by the reliance on a large language model and its fuzzy intepretations of messy text (although this is clearly preferable to regex), but also due to the fact that funders’ accounts are not held to any standard of enforcement in terms of the correct spelling, uniformity, or indeed accuracy of recipients’ names. Examples that have been noted during the course of this project include inconsistency in pluralisation and spacing (e.g. hospices vs hospice, Tearfund vs Tear Fund); missing words (e.g. British Red Cross Society vs British Red Cross); errors in punctuation (e.g. soldiers’ vs solider’s); and inconsistency in word order (e.g. Lincoln University vs University of Lincoln). These inconsistencies mean that grants to the same recipient may appear under multiple name variants in the database. However, as the current system focuses on funder-applicant alignment rather than recipient-level analysis these naming inconsistencies will not impact the core functionality and this remains an area for future development.

Human Recipients of Grants

As some funders give grants to individuals as well as organisations, Claude API has extracted some human names from the accounts. This is publicly available data but, even so, I do not feel comfortable storing people’s names in my database - plus, it will add no value to the project. As such, I will attempt to flag the human records for removal.

#flag records that might be human names
extracted_recipients = grants_df[grants_df["recipient_id"].str.startswith("PDF", na=False)].copy()
extracted_recipients["is_potential_human"] = extracted_recipients["recipient_name"].apply(check_names)
flagged_as_human = extracted_recipients[extracted_recipients["is_potential_human"]]["recipient_name"].unique()

print(f"Total unique extracted recipients: {extracted_recipients['recipient_name'].nunique():,}")
print(f"Names flagged as potentially human: {len(flagged_as_human):,}")

#add as column to grants_df
grants_df["is_potential_human"] = False
grants_df.loc[grants_df["recipient_id"].str.startswith("PDF", na=False), "is_potential_human"] = \
    grants_df.loc[grants_df["recipient_id"].str.startswith("PDF", na=False), "recipient_name"].apply(check_names)

Total unique extracted recipients: 6,865
Names flagged as potentially human: 552

grants_df.sort_values("is_potential_human", ascending=False).head(2)

	grant_title	grant_desc	amount	year	grant_id	source	funder_num	funder_grants_id	funder_name	recipient_id	recipient_grants_id	recipient_name	recipient_activities	recipient_areas	recipient_causes	recipient_beneficiaries	amount_gbp	word_count_grant_title	word_count_grant_desc	word_count_recipient_activities	short_grant_title_section	short_grant_desc_section	short_recipient_activities_section	original_recipient_id	is_potential_human
20615	None	None	1218.04	2023	2023_226743_24	Accounts	226743	59728	KNARESBOROUGH RELIEF IN NEED CHARITY	PDF-000185	58539	AMANDA AGNEW	None	[]	[]	[]	£1,218.04	NaN	NaN	NaN	False	False	False	PDF-000185	True
20619	None	None	921.50	2023	2023_226743_28	Accounts	226743	59732	KNARESBOROUGH RELIEF IN NEED CHARITY	PDF-000189	58543	LAURA ANDERSON	None	[]	[]	[]	£921.50	NaN	NaN	NaN	False	False	False	PDF-000189	True

#remove records
grants_df = grants_df[~((grants_df["funder_name"] == "KNARESBOROUGH RELIEF IN NEED CHARITY") &
                          (grants_df["is_potential_human"] == True))]

#delete rows where recipient_name starts with a number
grants_df = grants_df[~grants_df["recipient_name"].str.match(r'^\d', na=False)]

Grants - Duplication

Charity accounts follow a comparative reporting format where they present figures for both the current financial year and the previous financial year side by side. This means that the grants that have been extracted by the LLM are likely to be duplicated in some cases. I will identify duplicates by comparing the funder, recipient, grant amount and years. Where the exact same funder-recipient-amount combination is found in two consecutive years this is likely to be a duplicate, so I will remove these instances.

#sort grants extracted from accounts to spot duplicates
accounts_grants = grants_df[grants_df["source"] == "Accounts"].copy()
accounts_grants = accounts_grants.sort_values(["funder_num", "recipient_name", "amount", "year"])

#compare via shifted column
accounts_grants["prev_funder"] = accounts_grants["funder_num"].shift(1)
accounts_grants["prev_recipient"] = accounts_grants["recipient_name"].shift(1)
accounts_grants["prev_amount"] = accounts_grants["amount"].shift(1)
accounts_grants["prev_year"] = accounts_grants["year"].shift(1)

#mark as duplicate is same funder, recipient and amount but one year apart
accounts_grants["is_consecutive_dup"] = (
    (accounts_grants["funder_num"] == accounts_grants["prev_funder"]) &
    (accounts_grants["recipient_name"] == accounts_grants["prev_recipient"]) &
    (accounts_grants["amount"] == accounts_grants["prev_amount"]) &
    ((accounts_grants["year"] - accounts_grants["prev_year"]) == 1)
)

print(f"Found {accounts_grants['is_consecutive_dup'].sum()} consecutive year duplicates")

Found 1820 consecutive year duplicates

#remove duplicated grants and combine back with other grants
accounts_deduped = accounts_grants[~accounts_grants["is_consecutive_dup"]].drop(
    columns=["prev_funder", "prev_recipient", "prev_amount", "prev_year", "is_consecutive_dup"]
)
g360_grants = grants_df[grants_df["source"] == "360Giving"]
grants_df = pd.concat([g360_grants, accounts_deduped], ignore_index=True)

print(f"Removed {len(accounts_grants) - len(accounts_deduped)} duplicate grants")
print(f"New total grants: {len(grants_df):,}")

Removed 1820 duplicate grants
New total grants: 30,955

Grants - Amounts

As explored during the missingness analysis, some amounts are null and I have chosen to retain these. I will also sense check very small and very large amounts to confirm that they are indeed correct. I will exclude grants with values of £0 because these are typically accounting adjustments, regranted funds, or administrative entries rather than nulls or actual grant awards (360Giving, n.d.).

#get grants less than £100 except where this is expected
low_value_grants = grants_df[(grants_df["amount"] < 100) & (grants_df["amount"] > 0) & \
    (~grants_df["funder_name"].str.startswith("ROTARY", na=False)) & \
        (~grants_df["funder_name"].str.startswith("JAMES", na=False))]

#multiply by 1000 where relevant
multiply_000s_funders = ["1060133", "255281"]
for fun_num in multiply_000s_funders:
    grants_df.loc[(grants_df["funder_num"] == fun_num) & (grants_df["amount"] < 100), "amount"] = \
        grants_df.loc[(grants_df["funder_num"] == fun_num) & (grants_df["amount"] < 100), "amount"] * 1000

#reassign where amount is incorrect
grants_df.loc[(grants_df["funder_num"] == "298083") & (grants_df["amount"] == 1), "amount"] = None
grants_df.loc[(grants_df["grant_id"] == "2023_1045479_1"), "amount"] = 278.00

#recalculate gbp amounts
grants_df["amount_gbp"] = grants_df["amount"].apply(add_gbp_columns)

#drop rows with human names
grants_df = grants_df[(grants_df["recipient_id"] != "PDF-000179") & (grants_df["recipient_id"] != "PDF-000180")]

low_value_cleaned = grants_df[(grants_df["amount"] < 100) & (grants_df["amount"] > 0) & \
    (~grants_df["funder_name"].str.startswith("ROTARY", na=False)) & \
        (~grants_df["funder_name"].str.startswith("JAMES", na=False))]

low_value_cleaned.head(5)

	grant_title	grant_desc	amount	year	grant_id	source	funder_num	funder_grants_id	funder_name	recipient_id	recipient_grants_id	recipient_name	recipient_activities	recipient_areas	recipient_causes	recipient_beneficiaries	amount_gbp	word_count_grant_title	word_count_grant_desc	word_count_recipient_activities	short_grant_title_section	short_grant_desc_section	short_recipient_activities_section	original_recipient_id	is_potential_human
18912	None	None	14.00	2021	2021_1045479_1	Accounts	1045479	64186	BARKER-MILL FOUNDATION	PDF-002489	62997	PAROCHIAL CHURCH COUNCIL OF THE ECCLESIASTICAL...	None	[]	[]	[]	£14.00	NaN	NaN	NaN	False	False	False	PDF-002489	False
22513	None	None	30.00	2020	2020_222652_14	Accounts	222652	62270	ROCHDALE FUND FOR RELIEF IN SICKNESS	PDF-001568	61081	INJURED JOCKEY'S FUND	None	[]	[]	[]	£30.00	NaN	NaN	NaN	False	False	False	PDF-001568	False
22635	None	None	51.00	2021	2021_223903_18	Accounts	223903	62780	CLEEVES AND WHITEHEAD TRUST	PDF-001805	61591	CARL WHITEHEAD LODGE	None	[]	[]	[]	£51.00	NaN	NaN	NaN	False	False	False	PDF-001805	False
22842	None	None	25.00	2021	2021_224590_1	Accounts	224590	62504	SHAW LANDS TRUST	PDF-001638	61315	BARNSLEY & DISTRICT GENERAL HOSPITAL	None	[]	[]	[]	£25.00	NaN	NaN	NaN	False	False	False	PDF-001638	False
22843	None	None	35.00	2020	2020_224590_1	Accounts	224590	62544	SHAW LANDS TRUST	PDF-001638	61355	BARNSLEY & DISTRICT GENERAL HOSPITAL	None	[]	[]	[]	£35.00	NaN	NaN	NaN	False	False	False	PDF-001638	False

Explanation of Grants Cleaning Process

There are 6,865 grants in the database that have been extracted from PDFs, but using the names dataset I was able to filter this to 552 flagged as potentially containing human names. I could immediately see that the main culprit was a funder called KNARESBOROUGH RELIEF IN NEED CHARITY, which has named its individual recipients in its accounts. For the sake of reproducibility, I will programmatically remove records from this funder and with only two words - but for a more thorough removal it would be best to inspect each PDF-extracted grant record manually.

The deduplication process keeps the earliest year of each grant, as the earlier accounts show the actual grant, and later accounts are just showing “prior year” comparisons. In terms of functionality, this means that prospie will be able to inform the user, for example, ‘this funder gave £1,000 to XYZ Charity in 2023’.

Regarding cleaning the grants amounts data, I checked for values lower than £100 as this is a relatively unusual value for a grant. This showed that some funders (such as Rotary Clubs) can be expected to give low value grants, as this is how their distribution model works. Around 60% of the remaining low value were attributable to grants being reported in thousands of pounds (£000s) in the original accounts, e.g. a grant of £22,000 was recorded as £22, so I multiplied these values by 1000. The last transformation on grant amounts was for one funder whose accounts included the names of grant recipients but not values, and the LLM had mistakenly assigned an incorrect value of £1 to these grants. A spotcheck of the remaining low value suggests that these are accurate, with 0.14% of the grants in the dataset having a value of less than £100.

Descriptive Statistics

Summary Statistics

#build df of summary statistics
summary_data = make_summary_df(funders_df, grants_df)
summary_df = pd.DataFrame(summary_data)
summary_df["Value"] = summary_df.apply(format_stats, axis=1)
summary_df = format_df(summary_df)

Metric	Value
Total funders	996
Funders with grants	309
Funders with grants from 360Giving	11
Total recipients	19,466
Total grants	30,716
Total grant value	£1,428,662,939.67
Mean grants per funder	106
Most grants given by a funder	6,987
Fewest grants given by a funder	1
Mean recipients per funder	72.9
Mean areas per funder	3.6
Mean funder income	£1,258,942.79
Median funder income	£46,068.00
Mean funder expenditure	£1,188,764.68
Median funder expenditure	£43,765.50
Largest funder by mean income	Save The Children Fund (£284,868,800.00)
Largest funder by mean expenditure	Save The Children Fund (£281,939,194.60)
Mean grants per recipient	1.6
Range of grants received	1 to 79
Recipient of largest grant	Patricia, Donald, Christine & Patrick Shepherd Foundation (£11,182,253.00) From Patricia And Donald Shepherd Charitable Trust
Mean grant size	£46,587.85
Median grant size	£11,791.50
Standard deviation	£148,858.47
Smallest grant	£14.00
Largest grant	£11,182,253.00
Earliest grant	2001

Calculated Statistics

#get calculated stats and build df
stats = calculate_stats(funders_df, grants_df)
calculated_data = make_calculated_df(stats)
calculated_df = pd.DataFrame(calculated_data)
calculated_df["Value"] = calculated_df.apply(format_stats, axis=1)
calculated_df = format_df(calculated_df)

Metric	Value
Share of grants from top 10% funders (by income)	71.6%
Share of grants to top 10% recipients (by grant value)	71.5%
Percent of recipients with multiple grants from same funder	21.5%
Average grants per funder-recipient pair	1
Mean grants-to-income ratio	119.6%
Median grants-to-income ratio	57.6%
Percent of funders supporting General Charitable Purposes only	16.8%
Percent of extracted grants matched with recipient data	23.2%

Exploration of Giving Patterns and Funder Characteristics

1. Categorical Analysis

Classification Popularity

One of the key objectives of prospie is to help fundraisers navigate the confusing trusts landscape, which is confusing largely due to the mismatch of information that is available. Funders may indicate a particular cause or area of activity, but these do not always align with their actual giving habits. I will therefore compare the classifications that funders state (their identified causes, beneficiaries, and areas of activity) with those of the recipients who are awarded their grants.

#display plots to compare funders' and recipients' classifications
fig, axes = plt.subplots(3, 2, figsize=(18, 15))

#areas
make_bar_chart(grants_df, "recipient_areas", "Recipients: Most Popular Areas", color=colours[1], ax=axes[1, 0])
make_bar_chart(funders_df, "areas", "Funders: Most Popular Areas", color=colours[1], ax=axes[1, 1])

#beneficiaries
make_bar_chart(grants_df, "recipient_beneficiaries", "Recipients: Most Popular Beneficiaries", color=colours[2], ax=axes[2, 0])
make_bar_chart(funders_df, "beneficiaries", "Funders: Most Popular Beneficiaries", color=colours[2], ax=axes[2, 1])

#causes
make_bar_chart(grants_df, "recipient_causes", "Recipients: Most Popular Causes", color=colours[0], ax=axes[0, 0])
make_bar_chart(funders_df, "causes", "Funders: Most Popular Causes", color=colours[0], ax=axes[0, 1])

plt.tight_layout()
plt.subplots_adjust(hspace=0.4)
plt.show()

Relationships between Categories

#explode classification columns
exploded_classifications = grants_df[["recipient_causes", "recipient_beneficiaries"]].copy()
exploded_classifications = exploded_classifications.explode("recipient_causes").explode("recipient_beneficiaries")
exploded_classifications = exploded_classifications.dropna()

#create crosstab
crosstab = pd.crosstab(exploded_classifications["recipient_causes"], exploded_classifications["recipient_beneficiaries"])

#create heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(crosstab, cmap="Oranges", cbar_kws={"label": "Number of Grants"})
plt.title("Causes vs Beneficiaries Heatmap", fontsize=14, fontweight="bold")
plt.xlabel("Beneficiaries")
plt.ylabel("Causes")
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

Diversity of Giving Across Categories

#count unique categories per funder
funder_diversity = grants_df.groupby("funder_num").agg({
    "recipient_areas": lambda x: len(set([a for areas in x for a in areas])),
    "recipient_beneficiaries": lambda x: len(set([b for bens in x for b in bens])),
        "recipient_causes": lambda x: len(set([c for causes in x for c in causes]))
}).rename(columns={"recipient_areas": "unique_areas",
                    "recipient_beneficiaries": "unique_beneficiaries",
                    "recipient_causes": "unique_causes",})

#create histograms
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

categories = [
    ("unique_areas", "Areas", colours[1]),
    ("unique_beneficiaries", "Beneficiaries", colours[2]),
    ("unique_causes", "Causes", colours[0])
]

for idx, (col, label, colour) in enumerate(categories):
    #remove recipients with missing classifications
    diversity_filtered = funder_diversity[funder_diversity[col] > 0]
    
    if col == "unique_areas":
        axes[idx].hist(diversity_filtered[col], bins=20, color=colour)
    else:
        value_counts = diversity_filtered[col].value_counts().sort_index()
        axes[idx].bar(value_counts.index, value_counts.values, color=colour)
        axes[idx].xaxis.set_major_locator(plt.MaxNLocator(integer=True))
    
    axes[idx].set_title(f"Diversity of Giving - {label}")
    axes[idx].set_xlabel(f"Number of Unique {label} Funded")
    axes[idx].set_ylabel("Number of Funders")
    axes[idx].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

Categorical Match Rates

#add funder classifications to grants_df
grants_funders_merge = grants_df.merge(
    funders_df[["registered_num", "causes", "areas", "beneficiaries"]],
    left_on="funder_num",
    right_on="registered_num",
    how="left",
    suffixes=("", "_funder")
)

#find overlaps to identify matches (only where recipients have classifications)
for category in ["areas", "beneficiaries", "causes"]:
    grants_funders_merge[f'{category}_match'] = grants_funders_merge.apply(lambda row: check_overlap(row[category], row[f'recipient_{category}']), axis=1)

with_classifications = grants_funders_merge[
    (grants_funders_merge["recipient_areas"].str.len() > 0) &
    (grants_funders_merge["recipient_beneficiaries"].str.len() > 0) &
    (grants_funders_merge["recipient_causes"].str.len() > 0)
]

#calculate funder match rates
funder_match_rates = with_classifications.groupby("funder_num").agg({
    "areas_match": ["sum", "mean"],
    "beneficiaries_match": ["sum", "mean"],
    "causes_match": ["sum", "mean"],
    "grant_id": "count"
}).reset_index()

funder_match_rates.columns = ["funder_num", "areas_matches", "areas_match_rate", "beneficiaries_matches", "beneficiaries_match_rate", "causes_matches", "causes_match_rate", 
"total_grants"]

#add names
funder_match_rates = funder_match_rates.merge(
    funders_df[["registered_num", "name"]],
    left_on="funder_num",
    right_on="registered_num"
)

#create histograms
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for idx, category in enumerate(["areas", "beneficiaries", "causes"]):
    col = f"{category}_match_rate"
    mean_val = funder_match_rates[col].mean()

    axes[idx].hist(funder_match_rates[col], bins=20, color=colours[idx])
    axes[idx].axvline(mean_val, color="red", linestyle="--", linewidth=2,
                    label=f"Mean: {mean_val:.1%}")
    axes[idx].set_xlabel("Match Rate")
    axes[idx].set_ylabel("Number of Funders")
    axes[idx].set_title(f"{category.title()} Match Rate Distribution")
    axes[idx].legend()

plt.tight_layout()
plt.show()

Exploration of Findings from Categorical Analysis

It can be observed that funders often state an interest in general causes, areas and beneficiaries - many do not specify particular interests and state that they will consider applications from any area of the sector. Further analysis will be useful, particularly following the creation of embeddings, to understand the practical reality of their funding priorities, which may reveal implicit preferences or local biases not reflected in their published criteria, e.g. if ‘in reality, donations are completely earmarked for a few select beneficiaries’ (Lock and Robinson, 2023).

Whilst the heatmap shows the relationship between causes and beneficiaries across all grants, it is heavily skewed by high-volume funders like Esmée Fairbairn Foundation (6,000+ grants). This means that the patterns shown reflect individual funder preferences rather than sector-wide trends, so it does not offer us much useful insight or relevance to the alignment model. It would be more useful to analyse funder behaviour on an individual level, such as quantifying how well their stated preferences match with their actual giving.

Most funders concentrate their giving within a relatively narrow scope. There’s a long tail of funders who spread their giving more widely, but these are the exception. This clustering suggests that funders typically have quite focused areas of interest, even whe they state broad ranges in their registered details. I was unable to find an explicit explanation of the huge peak at 7 causes supported, but evidence suggests that Charity Commission classifications may have previously been limited to seven general causes, before a wider range of more precise causes were implemented at a later date.

Match rates show an interesting pattern - causes has the highest match rate at around 70%, followed by beneficiaries at 61%, whilst areas sits lowest at 45%. However, this doesn’t mean causes are more important - it’s actually the opposite. Causes are often incredibly generic (many charities state that they work in vague areas such as ‘education’ or ‘health’), so high match rates don’t tell us much. Areas, on the other hand, can be specific right down to town level, so a match there is much more meaningful. This has important implications for the weighting - areas need to be weighted more highly to reflect their specificity, and keyword matching will need careful calibration to avoid over-scoring when generic terms like ‘education’ appear on both sides.

A limitation of undertaking the analysis at this stage is missing recipient data; only 23.2% of grants that have been extracted from accounts are matched with a recipient (with data added by the recipients_table_builder). This means that around 3/4 of recipients do not have classifications data to analyse. This analysis will likely be more insightful once the embeddings data is available and semantic analysis can be performed on recipient_name and funders’ details.

2. Temporal and Financial Analysis

There is a disparity in the number of grants from each data source (i.e. the 360Giving API and extracted from accounts); the API returns historical data for this sample of funders reaching back to 2001, whereas accounts downloaded from the Charity Commission website date back only five years - 2020 at the point of writing. As such, there is a notable increase in the number of grants from 2020. This is a limitation to the strength of the analysis of giving volume, but it is still possible to explore patterns in grant amounts and other factors over time.

#set up analysis dfs
grants_source_yr = grants_df.groupby(["year", "source"]).agg({
    "grant_id": "count",
    "amount": ["sum", "mean"]
}).reset_index()
grants_source_yr.columns = ["year", "source", "num_grants", "total_amount", "avg_amount"]

grants_yr = grants_df.groupby("year").agg({
    "grant_id": "count",
    "amount": ["sum", "mean", "median"]
}).reset_index()
grants_yr.columns = ["year", "num_grants", "total_amount", "avg_amount", "median_amount"]

grants_over_1 = grants_df[grants_df["amount"] > 0].copy()
years_unique = sorted(grants_over_1["year"].unique())
data_by_year = [grants_over_1[grants_over_1["year"] == year]["amount"].values for year in years_unique]

Number of Grants

#pivot to look at grants by source and year
grants_pivot = grants_source_yr.pivot(index="year", columns="source", values="num_grants").fillna(0)

#create stacked bar chart
fig, ax = plt.subplots(figsize=(12, 6))
ax.stackplot(grants_pivot.index,
            grants_pivot["360Giving"],
            grants_pivot["Accounts"],
            labels=["360Giving", "Accounts"],
            colors=[colours[0], colours[1]],
            alpha=0.7)

ax.set_title("Number of Grants by Source Over Time", fontsize=14, fontweight="bold")
ax.set_ylabel("Number of Grants")
ax.legend(loc="upper left")
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

Values of Grants

#create line chart
fig, ax = plt.subplots(figsize=(12, 6))

ax.plot(grants_yr["year"], grants_yr["total_amount"], marker="o", linewidth=2, color=colours[2], label="Total")
for source in grants_source_yr["source"].unique():
    data = grants_source_yr[grants_source_yr["source"] == source]
    color = colours[0] if source == "360Giving" else colours[1]
    ax.plot(data["year"], data["total_amount"], marker=".", linewidth=1.5, color=color, alpha=0.4, linestyle="--", label=source, zorder=2)
ax.set_title("Total Amount Awarded Over Time", fontsize=14, fontweight="bold")
ax.set_ylabel("£ (millions)")
ax.legend(loc="upper left")
ax.grid(True, alpha=0.3)
ax.yaxis.set_major_formatter(FuncFormatter(lambda x, p: f'{x/1_000_000}'))

plt.tight_layout()
plt.show()

Distribution of Grants

#create boxplot
fig, ax = plt.subplots(figsize=(12, 6))
bp = ax.boxplot(data_by_year, labels=years_unique, showfliers=True)

#set log scale
ax.set_yscale("log")

ax.set_title("Distribution of Grant Values", fontsize=14, fontweight="bold")
ax.set_ylabel("Grant Value (£, log10)")
ax.grid(True, alpha=0.3, axis="y")
ax.yaxis.set_major_formatter(FuncFormatter(lambda x, p: f'£{int(x):,}'))

plt.tight_layout()
plt.show()

Exploration of Findings from Temporal and Financial Analysis

In the Number of Grants chart, the sharp increase from 2020 mostly reflects the data collection methodology rather than a genuine surge in grant-making - this is when the accounts data starts. Before 2020, we’re relying solely on 360Giving data which is patchier. However, the pandemic also dramatically affected grant-making, and this is the other explanatory factor for the sharp increase. The balance between the two sources shows that 360Giving provides good historical coverage back to the early 2000s, but accounts extraction is essential for capturing recent activity from funders who don’t publish to 360Giving.

In terms of giving values, there is notable year-on-year variation, with notable peaks in 2008, 2017, and 2021-22 and a general increase over time. The accounts data provides more stable coverage from 2020 onwards, but it is important to note that fluctuations in reporting (requirements) are a significant source of uncertainty. Grant values follow a log-normal distribution - most grants cluster in the £1,000-£10,000 range, but there’s a substantial tail of larger grants stretching into the hundreds of thousands.

The year-on-year consistency in the distribution suggests that typical grant sizes remain relatively stable over time, despite fluctuations in total volumes. The outliers above the whiskers represent major grants, often to large established charities or capital projects. Interestingly, the largest “grant” in the dataset was a transfer of funds from one foundation to its sister trust, representing a redistribution of assets rather than a true donation.

Exporting Dataframes back to Supabase

Table and Relationships Setup

#get final funder table
funders_table = funders_df[[
    "registered_num",
    "name",
    "website",
    "activities",
    "objectives",
    "achievements_performance",
    "grant_policy",
    "objectives_activities",
    "income_latest",
    "expenditure_latest",
    "is_on_list",
    "is_potential_sbf"
]]

#get final recipients table
recipients_table = grants_df[[
    "recipient_id",
    "recipient_name",
    "recipient_activities"
]].drop_duplicates(subset=["recipient_id"])

#add check for cleaned recipient ids
recipients_table["has_cleaned_id"] = recipients_table["recipient_id"].isin(id_mapping["recipient_id"].values)

#get final grants table
grants_table = grants_df[[
    "grant_id",
    "grant_title",
    "grant_desc",
    "amount",
    "year",
    "source"
]]

#dictionary to hold tables and their keys
tables = {
    "funders": (funders_table, "registered_num"),
    "grants": (grants_table, "grant_id"),
    "recipients": (recipients_table, "recipient_id")
}

Updated Recipient IDs - Join Tables

supabase = create_client(url, key)

#get old ids
old_recip_ids = id_mapping["original_recipient_id"].tolist()

if len(old_recip_ids) > 0:

    #remvoe from join tables
    join_tables = ["recipient_grants", "recipient_areas", "recipient_beneficiaries", "recipient_causes"]
    
    for table in join_tables:
        try:
            batch_size = 100
            for i in range(0, len(old_recip_ids), batch_size):
                batch = old_recip_ids[i:i+batch_size]
                response = supabase.table(table).delete().in_("recipient_id", batch).execute()
        except Exception as e:
            print(f"Error deleting from {table}: {e}")
    
    #remove from recipients table
    try:
        batch_size = 100
        for i in range(0, len(old_recip_ids), batch_size):
            batch = old_recip_ids[i:i+batch_size]
            response = supabase.table("recipients").delete().in_("recipient_id", batch).execute()
    except Exception as e:
        print(f"Error deleting from recipients: {e}")
    
    print("Deletion successful")
else:
    print("No IDs to clean.")

Deletion successful

#make dictionary to map ids and unique ids
id_map_dict = dict(zip(id_mapping["original_recipient_id"], id_mapping["recipient_id"]))
recipient_ids = set(recipients_table["recipient_id"].values)

#update all join tables
updated_recipient_grants = update_join_table(recipient_grants, id_map_dict, recipient_ids)
updated_recipient_areas = update_join_table(recipient_areas, id_map_dict, recipient_ids)
updated_recipient_causes = update_join_table(recipient_causes, id_map_dict, recipient_ids)
updated_recipient_beneficiaries = update_join_table(recipient_beneficiaries, id_map_dict, recipient_ids)

#drop helper column
recipients_table = recipients_table.drop(columns=["has_cleaned_id"])

Piping Data Back to Supabase

#upsert main tables
pipe_to_supabase(funders_table, "funders", "registered_num", url, key)
pipe_to_supabase(grants_table, "grants", "grant_id", url, key)
pipe_to_supabase(recipients_table, "recipients", "recipient_id", url, key)

#upsert join tables
pipe_to_supabase(updated_recipient_grants, "recipient_grants", "recipient_grants_id", url, key)
pipe_to_supabase(updated_recipient_areas, "recipient_areas", "recipient_areas_id", url, key)
pipe_to_supabase(updated_recipient_causes, "recipient_causes", "recipient_cause_id", url, key)
pipe_to_supabase(updated_recipient_beneficiaries, "recipient_beneficiaries", "recipient_ben_id", url, key)

print("Data upserted successfully")

✓ Successfully upserted all 996 records to funders
✓ Successfully upserted all 30716 records to grants
✓ Successfully upserted all 19466 records to recipients
✓ Successfully upserted all 30831 records to recipient_grants
✓ Successfully upserted all 18341 records to recipient_areas
✓ Successfully upserted all 21319 records to recipient_causes
✓ Successfully upserted all 16843 records to recipient_beneficiaries
Data upserted successfully