Exploratory Data Analysis for prospie

For my MSc final project, I wrangled, cleaned and analysed data retrieved from the Charity Commission and 360Giving APIs, and extracted from charity accounts.
data science
fundraising
python
data analysis
insights
Author

Liam Cottrell

Published

October 14, 2025

Exploratory Data Analysis

Setup

import pandas as pd
from supabase import create_client
from pathlib import Path
import os
import sys
from dotenv import load_dotenv
import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import seaborn as sns
colours = sns.color_palette("Set2")
import random
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
pd.options.mode.chained_assignment = None

project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from eda_utils import add_gbp_columns, get_longest_values, print_in_rows, check_names, check_overlap, clean_start_of_text, update_join_table
from stats_builder import make_summary_df, calculate_stats, make_calculated_df, format_stats, format_df
from plots_builder import make_bar_chart
from utils import get_table_from_supabase, build_relationship_cols, build_financial_history, add_grant_statistics
from data_importer import pipe_to_supabase

#get keys from env
load_dotenv()
url = os.getenv("SUPABASE_URL")
key = os.getenv("SUPABASE_KEY")

Retrieving Data from Supabase and Building Dataframes

I will connect to Supabase and retrieve all records, in order to start building my analysis dataframes. I will create one dataframe for funder information, and another for grants and recipients information.

#get tables and build dataframes
tables = ["funders", "causes", "areas", "beneficiaries", "grants",
               "funder_causes", "funder_areas", "funder_beneficiaries", "funder_grants", 
               "financials", "funder_financials"]

for table in tables:
    globals()[table] = get_table_from_supabase(url, key, table)

#get recipients with filter
recipients = get_table_from_supabase(url, key, "recipients", batch_size=50, filter_recipients=True)
all_recipient_ids = set(recipients["recipient_id"].unique())

#get and filter recipient join tables
recipient_join_tables = ["recipient_grants", "recipient_areas", "recipient_beneficiaries", "recipient_causes"]
for table in recipient_join_tables:
    df = get_table_from_supabase(url, key, table)
    globals()[table] = df[df["recipient_id"].isin(all_recipient_ids)]

The Funders Dataframe

Main Table

funders_df = funders.copy()

#define table relationships for funders
funder_rels = [
    {
        "join_table": funder_causes,
        "lookup_table": causes,
        "key": "cause_id",
        "value_col": "cause_name",
        "result_col": "causes"
    },
    {
        "join_table": funder_areas,
        "lookup_table": areas,
        "key": "area_id",
        "value_col": "area_name",
        "result_col": "areas"
    },
    {
        "join_table": funder_beneficiaries,
        "lookup_table": beneficiaries,
        "key": "ben_id",
        "value_col": "ben_name",
        "result_col": "beneficiaries"
    }
]

#add relationship columns
funders_df = build_relationship_cols(funders_df, "registered_num", funder_rels)

#add grant statistics columns
funders_df = add_grant_statistics(funders_df, "registered_num", funder_grants, grants)

#round to 2 decimal places
funders_df = funders_df.round(2)
pd.set_option("display.float_format", "{:.2f}".format)

#format financial columns
float_cols = ["income_latest", "expenditure_latest", "total_given", "avg_grant", "median_grant"]
for col in float_cols:
    if col in funders_df.columns:
        funders_df[col + "_gbp"] = funders_df[col].apply(add_gbp_columns)

Financial History Table

funders_df = build_financial_history(funders_df, "registered_num", funder_financials, financials)

The List Entries

#get list entries
list_entries = get_table_from_supabase(url, key, "list_entries")
funder_list = get_table_from_supabase(url, key, "funder_list")
list_with_info = funder_list.merge(list_entries, on="list_id")

#get list of entries for each funder
list_grouped = list_with_info.groupby("registered_num")["list_info"].apply(list).reset_index()
list_grouped.columns = ["registered_num", "list_entries"]

#merge with funders and replace nans
funders_df = funders_df.merge(list_grouped, on="registered_num", how="left")
funders_df["list_entries"] = funders_df["list_entries"].apply(lambda x: x if isinstance(x, list) else [])
#extend column view, sort and preview funders
pd.set_option("display.max_columns", 100)
funders_df = funders_df.sort_values("total_given_gbp", ascending=False)
funders_df.head()
registered_num name website activities objectives income_latest expenditure_latest objectives_activities achievements_performance grant_policy is_potential_sbf is_on_list is_nua causes areas beneficiaries num_grants total_given avg_grant median_grant income_latest_gbp expenditure_latest_gbp total_given_gbp avg_grant_gbp median_grant_gbp income_history expenditure_history list_entries
872 288086 FARRER AND CO CHARITABLE TRUST https://www.farrer.co.uk THE MAIN PURPOSE OF THE TRUST IS TO APPLY INCO... IN OR TOWARDS THE GENERAL PURPOSES OF SUCH CHA... 80569.00 78229.00 None None None False False False [General Charitable Purposes] [Throughout England And Wales] [Other Charities Or Voluntary Bodies] 14 97590.14 6970.72 4000.00 £80,569.00 £78,229.00 £97,590.14 £6,970.72 £4,000.00 {2020: 78285.0, 2021: 52439.0, 2022: 79299.0, ... {2020: 76431.0, 2021: 61395.0, 2022: 76795.0, ... []
51 1041449 MIRACLES https://www.miraclesthecharity.org MIRACLES IS A CHARITY OFFERING HOPE FOR THOSE ... THE RELIEF OF POVERTY AND HARDSHIP OF PERSONS ... 351143.00 908121.00 THE AIMS AND OBJECTS OF MIRACLES REMAIN TO PRO... THIRTY-ONE YEARS ON SINCE THE LAUNCH OF MIRACL... None False False False [General Charitable Purposes, Education/traini... [Scotland, Europe, Throughout England] [The General Public/mankind] 8 96073.00 12009.12 2750.00 £351,143.00 £908,121.00 £96,073.00 £12,009.12 £2,750.00 {2020: 235183.0, 2021: 325035.0, 2022: 303775.... {2020: 322068.0, 2021: 337276.0, 2022: 274309.... []
968 526956 KELSICK'S EDUCATIONAL FOUNDATION https://www.kelsick.org.uk TO PROVIDE GRANTS FOR EDUCATIONAL NEEDS TO YOU... 1)THE PROVISION OF SPECIAL BENEFITS, OF ANY KI... 411338.00 408717.00 THE OBJECTS OF THE CHARITY ARE:\n1. THE PROVIS... THE KELSICK'S EDUCATION FOUNDATION CONTINUES T... THE FOUNDATION INVITES APPLICATIONS FOR GRANTS... False False False [Education/training] [Cumbria] [Children/young People, People With Disabiliti... 17 956424.00 56260.24 58800.00 £411,338.00 £408,717.00 £956,424.00 £56,260.24 £58,800.00 {2020: 416991.0, 2021: 387390.0, 2022: 407536.... {2020: 408890.0, 2021: 381641.0, 2022: 382351.... []
772 260741 A P TAYLOR TRUST https://www.aptaylortrust.org.uk THE YEARLY INCOME OF THE FUND PROVIDED BY THE ... FUND FOR THE USE OF THE INHABITANTS OF THE PAR... 111430.00 136811.00 THE YEARLY INCOME OF THE FUND PROVIDED BY THE ... DURING THE YEAR, THE CHARITY CHOSE TO OFFER AD... DURING THE YEAR, A TOTAL AMOUNT OF £36,450 WAS... False False False [General Charitable Purposes, The Advancement ... [Hillingdon, Greater London] [Children/young People, Elderly/old People, Pe... 75 95240.00 1269.87 300.00 £111,430.00 £136,811.00 £95,240.00 £1,269.87 £300.00 {2020: 112954.0, 2021: 105886.0, 2022: 98342.0... {2020: 114393.0, 2021: 124987.0, 2022: 114089.... []
806 270718 FRANCIS COALES CHARITABLE FOUNDATION https://franciscoales.co.uk TO ASSIST WITH GRANTS FOR THE STRUCTURAL REPAI... TO PROVIDE FUNDS DIRECTLY OR BY WAY OF GRANT O... 117420.00 118765.00 None None None False False False [Environment/conservation/heritage] [Buckinghamshire, Hertfordshire, Northamptonsh... [Other Defined Groups] 3 93953.00 31317.67 5000.00 £117,420.00 £118,765.00 £93,953.00 £31,317.67 £5,000.00 {2020: 111553.0, 2021: 134468.0, 2022: 149302.... {2020: 129452.0, 2021: 132270.0, 2022: 120004.... []
#get checkpoint folder
# checkpoint_folder = Path("./6.1_checkpoints/")

#create checkpoint - save df to pickle
# funders_df.to_pickle(checkpoint_folder / "funders_df.pkl")
# print("Saved funders_df to checkpoint")

The Grants Dataframe

Main Table

grants_df = grants.copy()

#ddd funder info
grants_df = grants_df.merge(funder_grants, on="grant_id")
grants_df = grants_df.merge(funders[["registered_num", "name"]], on="registered_num")
grants_df = grants_df.rename(columns={"name": "funder_name", "registered_num": "funder_num"})

#ddd recipient info  
grants_df = grants_df.merge(recipient_grants, on="grant_id")
grants_df = grants_df.merge(recipients[["recipient_id", "recipient_name", "recipient_activities"]], 
                        on="recipient_id", 
                        how="left")

#define relationships for recipients
recipient_rels = [
    {
        "join_table": recipient_areas,
        "lookup_table": areas,
        "key": "area_id",
        "value_col": "area_name",
        "result_col": "recipient_areas"
    },
    {
        "join_table": recipient_causes,
        "lookup_table": causes,
        "key": "cause_id",
        "value_col": "cause_name",
        "result_col": "recipient_causes"
    },
    {
        "join_table": recipient_beneficiaries,
        "lookup_table": beneficiaries,
        "key": "ben_id",
        "value_col": "ben_name",
        "result_col": "recipient_beneficiaries"
    }
]

#add relationship columns
grants_df = build_relationship_cols(grants_df, "recipient_id", recipient_rels)

#add source of grant
grants_df["source"] = grants_df["grant_id"].apply(lambda x: "Accounts" if str(x).startswith("2") else "360Giving")

#round to 2 decimal places
grants_df = grants_df.round(2)

#format financial columns
grants_df["amount_gbp"] = grants_df["amount"].apply(add_gbp_columns)
#sort and preview grants
grants_df = grants_df.sort_values("grant_title", ascending=True)
grants_df.head()
grant_title grant_desc amount year grant_id source funder_num funder_grants_id funder_name recipient_id recipient_grants_id recipient_name recipient_activities recipient_areas recipient_causes recipient_beneficiaries amount_gbp
24592 "A BLUE NEW DEAL FOR UK COSTAL COMMUNITIES" PR... TOWARDS THE COSTS OF THE 'BLUE NEW DEAL', WHIC... 90000.00 2014 360G-Ellerman-15189 360Giving 263207 44987 JOHN ELLERMAN FOUNDATION 1055254 43799 NEW ECONOMICS FOUNDATION NEF AIMS TO MAXIMISE WELL-BEING AND SOCIAL JUS... [Throughout England And Wales] [Education/training, Economic/community Develo... [The General Public/mankind] £90,000.00
29639 "BEFRIENDING COFFEE CLUB" THE BEFRIENDING COFFEE CLUB WAS A TWO-YEAR PRO... 27313.00 2017 360G-PeoplesHealthTrust-2017_5 360Giving 1125537 41014 PEOPLE'S HEALTH TRUST 1166949 39824 SANDWELL AFRICAN WOMEN ASSOCIATION 1. ADVICE & SUPPORT\rA) GENERAL HELP WITH CLAI... [Sandwell, West Midlands] [General Charitable Purposes, The Advancement ... [Children/young People, Elderly/old People, Pe... £27,313.00
29462 "BETTER LIVES" THIS TWO-YEAR PROJECT DELIVERED ACTIVITIES AND... 43982.00 2017 360G-PeoplesHealthTrust-2017_103 360Giving 1125537 40917 PEOPLE'S HEALTH TRUST 1093240 39727 SOUTHEND CARERS TO PROVIDE INFORMATION, ADVICE AND SUPPORT TO ... [Southend-on-sea, Essex] [Disability, Other Charitable Purposes] [Children/young People, Elderly/old People, Pe... £43,982.00
24854 "BLUE PANET II - TURNING VIEWRS INTO CHAMPIONS... TOWARDS A JOINT INITIATIVE BY MCS, GREENPEACE ... 6000.00 2017 360G-Ellerman-17137 360Giving 263207 45242 JOHN ELLERMAN FOUNDATION 1004005 44054 MARINE CONSERVATION SOCIETY THE MARINE CONSERVATION SOCIETY FIGHTS FOR THE... [Scotland, Europe, Turks And Caicos Islands, N... [Education/training, Arts/culture/heritage/sci... [The General Public/mankind] £6,000.00
25044 "BODY VESSEL CLAY – WOMEN RETHINKING CERAMICS"... "BODY VESSEL CLAY – WOMEN RETHINKING CERAMICS"... 30000.00 2021 360G-Ellerman-18459 360Giving 263207 45468 JOHN ELLERMAN FOUNDATION 1123081 44280 BULLDOG TRUST LIMITED THE TRUST PROVIDES FINANCIAL AND ADVISORY ASSI... [Throughout England And Wales] [General Charitable Purposes, Arts/culture/her... [The General Public/mankind] £30,000.00
#create checkpoint - save df to pickle
# grants_df.to_pickle(checkpoint_folder / "grants_df.pkl")
# print("Saved grants_df to checkpoint")

Retrieving Data from Checkpoints

#get checkpoint folder
checkpoint_folder = Path("./6.1_checkpoints/")

#get checkpoint
funders_df = pd.read_pickle(checkpoint_folder / "funders_df.pkl")
grants_df = pd.read_pickle(checkpoint_folder / "grants_df.pkl")

Data Quality

1. Missingness

I will view the structure of each dataframe to check for missing data and confirm that datatypes are correct.

Analysis of Missingness in Funders Dataframe

funders_df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 996 entries, 820 to 995
Data columns (total 28 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   registered_num            996 non-null    object 
 1   name                      996 non-null    object 
 2   website                   497 non-null    object 
 3   activities                991 non-null    object 
 4   objectives                996 non-null    object 
 5   income_latest             996 non-null    float64
 6   expenditure_latest        996 non-null    float64
 7   objectives_activities     229 non-null    object 
 8   achievements_performance  193 non-null    object 
 9   grant_policy              52 non-null     object 
 10  is_potential_sbf          996 non-null    bool   
 11  is_on_list                996 non-null    bool   
 12  is_nua                    996 non-null    bool   
 13  causes                    996 non-null    object 
 14  areas                     996 non-null    object 
 15  beneficiaries             996 non-null    object 
 16  num_grants                309 non-null    Int64  
 17  total_given               309 non-null    float64
 18  avg_grant                 309 non-null    float64
 19  median_grant              309 non-null    float64
 20  income_latest_gbp         996 non-null    object 
 21  expenditure_latest_gbp    996 non-null    object 
 22  total_given_gbp           996 non-null    object 
 23  avg_grant_gbp             996 non-null    object 
 24  median_grant_gbp          996 non-null    object 
 25  income_history            996 non-null    object 
 26  expenditure_history       996 non-null    object 
 27  list_entries              996 non-null    object 
dtypes: Int64(1), bool(3), float64(5), object(19)
memory usage: 206.2+ KB
#missing activities - check manually
missing_activities = funders_df[funders_df["activities"].isna()]["registered_num"].tolist()
print(missing_activities)
['1207622', '1203444', '1203389', '1203342', '1204043']
#missing sections - check proportion of funders with sections extracted from accounts
has_sections = funders_df[
    funders_df["objectives_activities"].notna() |
    funders_df["achievements_performance"].notna() |
    funders_df["grant_policy"].notna()
]

has_sections_total = len(has_sections)
has_sections_proportion = has_sections_total / 996

print(f"Funders with accessible accounts: 327")
print(f"Funders with extracted sections: {has_sections_total}")
print(f"Proportion of funders with sections: {has_sections_proportion:.2%}")
Funders with accessible accounts: 327
Funders with extracted sections: 242
Proportion of funders with sections: 24.30%
#missing classifications - check manually
classifications = ["causes", "areas", "beneficiaries"]

for classification in classifications:
    missing = funders_df[funders_df[classification].apply(lambda x: len(x) == 0)]
    print(f"{len(missing)} ({len(missing)/len(funders_df)*100:.1f}%) missing from {classification}")
    print(f"Registered numbers: {missing['registered_num'].tolist()}\n")
0 (0.0%) missing from causes
Registered numbers: []

0 (0.0%) missing from areas
Registered numbers: []

7 (0.7%) missing from beneficiaries
Registered numbers: ['1180365', '1203444', '1180966', '1181515', '1174620', '1191072', '1166657']

Exploration of Findings from Missingness Analysis (Funders)

There are five funders in the database with empty activities. I have checked the Charity Commission website and it does appear that these funders have simply not declared any activities. They are all relatively new having submitted only one set of accounts. The activities_objectives column has not been populated for any of these funders, which is unfortunate as this could have provided a further source of information. One of these funders has a website so, if I am able to achieve my stretch target of scraping websites, this may offer details (although at the time of writing, the website does not exist).

It is expected that a significant proportion of funders would not have a website, and no action is required to address this.

Checking missing classifications shows that the dataset is almost entirely complete in this regard; just seven funders are missing beneficiaries, and having checked the Charity Commission website, it appears that these funders have not declared any beneficiaries. This is such a small proportion of the dataset that I am not concerned about its impact on analysis and embeddings.

Problem with Accounts

My database building scripts (for PDFs) have a serious limitation as I am unable to scrape the Charity Commission website for accounts where the page contains JavaScript. The older pages, which are basic HTML, are accessible but the newer ones are not, and I am unable to tell which charities have been updated to the new system until the script attempts to scrape them and fails. Having built the database, I can now see from the print statements that 327 funders have accessible accounts, representing 32.8% of the 996 funders in the sample. My calculations above show that the required sections have been extracted from accounts for 24.4% of funders - varying from 52 funders with grant_policy, to 193 funders with achievements_peformance and 229 funders with objectives_activities.

Analysis of Missingness in Grants Dataframe

grants_df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 33152 entries, 6305 to 33151
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   grant_title              18116 non-null  object 
 1   grant_desc               18116 non-null  object 
 2   amount                   33106 non-null  float64
 3   year                     33152 non-null  int64  
 4   grant_id                 33152 non-null  object 
 5   source                   33152 non-null  object 
 6   funder_num               33152 non-null  object 
 7   funder_grants_id         33152 non-null  int64  
 8   funder_name              33152 non-null  object 
 9   recipient_id             33152 non-null  object 
 10  recipient_grants_id      33152 non-null  int64  
 11  recipient_name           33152 non-null  object 
 12  recipient_activities     15829 non-null  object 
 13  recipient_areas          33152 non-null  object 
 14  recipient_causes         33152 non-null  object 
 15  recipient_beneficiaries  33152 non-null  object 
 16  amount_gbp               33152 non-null  object 
dtypes: float64(1), int64(3), object(13)
memory usage: 4.6+ MB
#missing grant title and description - check numbers per source
missing_by_source = grants_df.groupby("source").agg({
    "grant_title": lambda x: x.isna().sum(),
    "grant_desc": lambda x: x.isna().sum(),
    "grant_id": "count"
}).rename(columns={"grant_id": "total_grants", "grant_title": "missing_title", "grant_desc": "missing_desc"})

print(missing_by_source)
           missing_title  missing_desc  total_grants
source                                              
360Giving              0             0         18116
Accounts           15036         15036         15036
#missing grant amounts - spotcheck manually
missing_amounts = grants_df[grants_df["amount"].isna()]["funder_num"].tolist()
print(missing_amounts)
['217929', '217929', '217929', '217929', '217929', '217929', '217929', '217929', '252222', '1052958', '525834', '244053', '503384', '503384', '503384', '503384', '503384', '503384', '503384', '503384', '503384', '503384', '503384', '503384', '219289', '1017255', '803301', '217929', '217929', '217929', '237048', '237048', '237048', '237048', '237048', '237048', '237048', '237048', '237048', '237048', '291363', '291363', '291363', '291363', '268745', '1059451']
#missing recipient activities - compare by source
missing_recips = grants_df["recipient_activities"].isna().sum()
accounts_recips = grants_df[
    (grants_df["source"] == "Accounts") &
    (grants_df["recipient_activities"].isna())
]
missing_360g = grants_df[
    (grants_df["source"] == "360Giving") &
    (grants_df["recipient_activities"].isna())
]

print(f"Grants extracted from accounts: {len(accounts_recips):,} ({len(accounts_recips)/missing_recips*100:.1f}%)")
print(f"360Giving recipients with missing activities: {len(missing_360g):,} ({len(missing_360g)/missing_recips*100:.1f}%)")
Grants extracted from accounts: 11,491 (66.3%)
360Giving recipients with missing activities: 5,832 (33.7%)

Exploration of Findings from Missingness Analysis (Grants)

There are 33,147 rows in the dataframe, 287 more than the number of unique grants as some grants are given to multiple recipients. The deduplication script in stats_builder.py ensures that the value of each grant has only been counted once for the purpose of the summary/calculated statistics.

I have explored the missing data from grant_title and grant_desc to check that only grants extracted from accounts are missing these fields. This was to be expected as these details are not available for the grants from accounts; the figures above confirm that every grant from the 360Giving API is complete with a title and description.

Of the 32,815 grants in the database, 46 (0.14%) are missing amount. I have performed a spot-check of 12 grants (25%), which suggests that these are genuine extraction errors from funder accounts rather than systematic issues. I have decided to keep these 46 grants in the database as they still provide valuable information about funder-recipient relationships and giving patterns, and such a small number of missing amounts is not likely to affect analyses/models.

I also explored the 17,318 rows in grants_df with missing recipient_activities. Null values in this column are to be expected for recipients that are added following extraction from PDFs, which accounts for 11,486 (66.3%). The remaining 5,832 (33.7%) null values - which are present within records added to the database by the 360Giving API - can be explained by: - Charities that have been removed from the Charity Commission, e.g. if they have closed down - Registered charities that simply do not declare their activities (as present in 5 records in funders_df) - Where abnormalities are evident in registered_num. For example, the RSPB is identified in the 360Giving API by 207076 & SC037654 - its registered numbers for both the Charity Commission of England & Wales and the Office of the Scottish Charity Regulator. My database builder script was unable to match on 207076 & SC037654 as registered_num.


2. Word Counts

I will check the lengths of the shortest and longest text entries, to ensure that they have been imported correctly and are not too short or long for unexpected reasons. Many funders provide very short explanations of their activities/objectives etc., such as simply “grant-giving” which is just one word - so this would not be abnormal. I will confirm that particularly long entries are not corrupted or the result of multiple documents being combined accidentally.

Analysis of Word Counts in Funders Dataframe

#check word counts for text columns in funders df
funders_text_cols = ["activities", "objectives", "objectives_activities", "achievements_performance", "grant_policy"]

#create columns
for col in funders_text_cols:
    funders_df[f"word_count_{col}"] = funders_df[col].str.split().str.len()
for col in funders_text_cols:
    print(f"{col.upper()}")
    print(f"{'_'*30}\n")

    word_count_col = f"word_count_{col}"
    not_nas = funders_df[funders_df[word_count_col].notna()]

    #get minimums and maximums for each text column
    if len(not_nas) > 0:
        min_idx = not_nas[word_count_col].idxmin()
        max_idx = not_nas[word_count_col].idxmax()

        examples = funders_df.loc[[min_idx, max_idx], ["registered_num", "name", word_count_col, col]]
        examples.index = ["Minimum", "Maximum"]

        display(examples)
    else:
        print("No data available\n")
ACTIVITIES
______________________________
registered_num name word_count_activities activities
Minimum 277767 WANSTEAD HIGH SCHOOL PARENT-TEACHER ASSOCIATION 1.00 FUNDRAISING
Maximum 217929 BARTON UNDER NEEDWOOD AND DUNSTALL KEY TRUST 76.00 TO PAY THE MINISTER OF THE CHURCH OF ST JAMES ...
OBJECTIVES
______________________________
registered_num name word_count_objectives objectives
Minimum 213077 EPPING AND THEYDON GARNON CHARITIES 2 SEE CONSTITUENTS
Maximum 1104413 INSPIRE HOUNSLOW 524 3. THE OBJECTS OF THE COMPANY (THE "OBJECTS") ...
OBJECTIVES_ACTIVITIES
______________________________
registered_num name word_count_objectives_activities objectives_activities
Minimum 1030939 TESLER FOUNDATION 16.00 OBJECTIVES AND AIMS THE CHARITY IS GOVERNED BY...
Maximum 263207 JOHN ELLERMAN FOUNDATION 5426.00 TARGET OF CASH + 4% ANNUALLY, NET OF COSTS. A ...
ACHIEVEMENTS_PERFORMANCE
______________________________
registered_num name word_count_achievements_performance achievements_performance
Minimum 802125 TOMCHEI TORAH CHARITABLE TRUST 14.00 CHARITABLE ACTIVITIES DURING THE YEAR THE CHAR...
Maximum 263207 JOHN ELLERMAN FOUNDATION 5426.00 TARGET OF CASH + 4% ANNUALLY, NET OF COSTS. A ...
GRANT_POLICY
______________________________
registered_num name word_count_grant_policy grant_policy
Minimum 1057692 DOWNEND ROUND TABLE 8.00 POLICY ON SOCIAL INVESTMENT INCLUDING PROGRAM ...
Maximum 299918 C A REDFERN CHARITABLE FOUNDATION 1181.00 THE TRUSTEES MEET REGULARLY TO DISCUSS THE MAK...

Word Counts for Sections Returned from Charity Commission API

#high word counts - check top 10 longest sections from API (to manually check)
long_activities = get_longest_values(funders_df, "word_count_activities", "registered_num")
long_objectives = get_longest_values(funders_df, "word_count_objectives", "registered_num")

api_sections_long = [
    ("activities", long_activities),
    ("objectives", long_objectives)
]

for name, longest in api_sections_long:
    print(f"Top 10 longest {name}: {longest}")
Top 10 longest activities: ['217929', '213077', '207342', '298379', '516301', '1135751', '272671', '1114624', '1197536', '1193970']
Top 10 longest objectives: ['1104413', '310799', '1132994', '229974', '1190948', '1194823', '1094675', '1047947', '1173674', '1147921']
#low word counts - count and view short sections from API
api_sections_short = {
    "activities": "Short activities sections",
    "objectives": "Short objectives sections"
}

for col, title in api_sections_short.items():
    funders_df[f"short_{col}_section"] = (
        (funders_df[col].str.split().str.len() < 5) &
        (funders_df[col].str.upper().str.strip() != "GENERAL CHARITABLE PURPOSES")
    )
    short_sections = funders_df[funders_df[f"short_{col}_section"]][col].dropna().unique()
    
    print(f"\n{title}: {funders_df[f'short_{col}_section'].sum():,}")
    print_in_rows(short_sections, 5)

Short activities sections: 39
A GRANT GIVING TRUST, CHARITABLE PURPOSES, GENERAL CHARITABLE PURPOSES., GRANT MAKING, CHARITABLE GIVING
PROJECTS FOR NEEDY, FUNDRAISING, GRANT MAKING CHARITY, GRANT-MAKING CHARITY, GENERAL CHARITABLE DONATIONS
GRANT GIVING., MAKING GRANTS, GRANT-MAKING, ANIMAL WELFARE, AT THE TRUSTEES DISCRETION.
FUNDING CHARITABLE CAUSES., GRANT GIVING CHARITY, GRANT-MAKING., ASSISTANCE WITH RELIGIOUS EDUCATION, ENCOURAGING CHARITABLE GIVING
ADVANCING ORTHODOX JEWISH FAITH, GRANTS TO ADVANCE EDUCATION., RELIEF OF POOR, HELPING DISADVANTAGED CHILDREN, EDUCATION / TRAINING
NONE, PROVIDING GRANTS, GENERAL CHARITABLE ACTIVITIES, THROUGHOUT ENGLAND AND WALES, NONE UNDERTAKEN
CHARITABLE GRANTS MADE, SUPPORT LOCAL CHARITIES.

Short objectives sections: 15
SEE INDIVIDUAL CONSTITUENTS, GENERAL CHARITABLE PURPOSES., SEE INDIVIDUAL CONSTITUENTS., SEE CONSTITUENTS, FOR CHARITABLE PURPOSES.
ANY CHARITABLE PURPOSE.
#low word counts - check activities containing "NONE"
activities_none = funders_df["activities"].str.contains("NONE", na=False)
funders_df[activities_none]
registered_num name website activities objectives income_latest expenditure_latest objectives_activities achievements_performance grant_policy is_potential_sbf is_on_list is_nua causes areas beneficiaries num_grants total_given avg_grant median_grant income_latest_gbp expenditure_latest_gbp total_given_gbp avg_grant_gbp median_grant_gbp income_history expenditure_history list_entries word_count_activities word_count_objectives word_count_objectives_activities word_count_achievements_performance word_count_grant_policy short_activities_section short_objectives_section
355 505515 ALLENDALE EXHIBITION ENDOWMENT None NONE THE INCOME OF THE CHARITY SHALL BE APPLIED BY-... 24000.00 14299.00 None None None False False False [Education/training] [Northumberland] [Children/young People] <NA> NaN NaN NaN £24,000.00 £14,299.00 {2020: 7044.0, 2021: 4100.0, 2022: 13836.0, 20... {2020: 7190.0, 2021: 5000.0, 2022: 13083.0, 20... [] 1.00 111 NaN NaN NaN True False
367 1103709 MARCIA AND ANDREW BROWN CHARITABLE TRUST None NONE UNDERTAKEN TO BENEFIT SUCH EXCLUSIVELY CHARITABLE PURPOSE... 304104.00 190330.00 None None None False False False [General Charitable Purposes] [Throughout England And Wales] [The General Public/mankind] <NA> NaN NaN NaN £304,104.00 £190,330.00 {2020: 201144.0, 2021: 400020.0, 2022: 1850115... {2020: 33618.0, 2021: 63195.0, 2022: 87510.0, ... [] 2.00 18 NaN NaN NaN True False

Word Counts for Sections Extracted from PDF Accounts

#high word counts - check top 10 longest extracted sections (to manually check)
long_obj_act = get_longest_values(funders_df, "word_count_objectives_activities", "registered_num")
long_ach_perf = get_longest_values(funders_df, "word_count_achievements_performance", "registered_num")
long_grant_policy = get_longest_values(funders_df, "word_count_grant_policy", "registered_num")

accounts_sections_long = [
    ("objectives_activities", long_obj_act),
    ("achievements_performance", long_ach_perf),
    ("grant_policy", long_grant_policy)
]

for name, longest in accounts_sections_long:
    print(f"Top 10 longest {name}: {longest}")
Top 10 longest objectives_activities: ['263207', '803428', '252892', '213890', '1041449', '296284', '217929', '288068', '299918', '260589']
Top 10 longest achievements_performance: ['263207', '803428', '200198', '213890', '1041449', '217929', '299918', '254532', '296284', '288068']
Top 10 longest grant_policy: ['299918', '1038860', '216308', '288068', '257636', '1026752', '272161', '255281', '226121', '1040778']
#low word counts - count and view short sections from accounts
accounts_sections_short = {
    "objectives_activities": "Short objectives_activities sections",
    "achievements_performance": "Short achievements_performance sections",
    "grant_policy": "Short grant_policy sections"
}

for col, title in accounts_sections_short.items():
    funders_df[f"short_{col}_section"] = (
        (funders_df[col].str.split().str.len() < 20)
    )
    short_sections = funders_df[funders_df[f"short_{col}_section"]][col].dropna().unique()
    
    print(f"\n{title}: {funders_df[f'short_{col}_section'].sum():,}")
    print_in_rows(short_sections, 2)

Short objectives_activities sections: 3
THE TRUSTEES CONTINUALLY CONSIDER THE ONGOING POSITION OF THE FARM LAND, PARTICULARLY IN VIEW OF POSSIBLE DEVELOPMENT., - - - - TOTAL INCOMING RESOURCES  52,119 - 52,119 43,051    __ _ ________ _____ __ ___ ________
OBJECTIVES AND AIMS THE CHARITY IS GOVERNED BY A DEED OF TRUST DATED 21ST FEBRUARY 1992.

Short achievements_performance sections: 1
CHARITABLE ACTIVITIES DURING THE YEAR THE CHARITY MADE CHARITABLE GRANTS TOTALLING £13,374 (2022: £15,530).

Short grant_policy sections: 10
POLICY ON SOCIAL INVESTMENT INCLUDING PROGRAM RELATED INVESTMENT, GRANTS ARE MADE TO CHARITABLE INSTITUTIONS, ORGANIZATIONS AND AUTHORIZED INDIVIDUALS WHICH ACCORD WITH THE OBJECTS OF THE CHARITY.
GRANTS ARE MADE TO CHARITABLE INSTITUTIONS AND ORGANIZATIONS WHICH ACCORD WITH THE OBJECTS OF THE CHARITY.

Exploration of Findings from Word Count Analysis (Funders)

Sections from API

Having manually checked the maximum values against their Charity Commission records, I am confident that there are no entries that are concerningly long. The top 10 longest activities are not abnormal; they are just relatively wordy compared to others. I viewed the shortest values (fewer than five words) and concluded that they are acceptable, although I will reassign the values containing “NONE” to N/A to improve the quality of the embedding.

Sections from Accounts

The word count analysis has highlighted that there are numerous issues with the sections that have been extracted from accounts. Many that are too long contain multiple sections, whereas many that are too short have been cut off prematurely. There is also a problem of repetition - the extractor sometimes extracts the same (excessively long) chunk of text for both objectives_activities and achievements_performance, defeating the purpose. Looking at the top 10 shortest and longest has demonstrated that the regex extraction technique employed in the database builder has not worked very well. Rather than rebuilding the entire PDF extraction pipeline, I will address this data quality issue by re-processing the extracted text through an LLM-based extractor with the aim of better handling the variation in section sizes and formatting. The LLM should be better at the fuzzy matching that regex has struggled with.

Analysis of Word Counts in Grants Dataframe

#check word counts for text columns in grants df
grants_text_cols = ["grant_title", "grant_desc", "recipient_activities"]

#create columns
for col in grants_text_cols:
    grants_df[f"word_count_{col}"] = grants_df[col].str.split().str.len()
for col in grants_text_cols:
    print(f"{col.upper()}")
    print(f"{'_'*30}\n")

    word_count_col = f"word_count_{col}"
    not_nas = grants_df[grants_df[word_count_col].notna()]

    #get minimums and maximums for each text column
    if len(not_nas) > 0:
        min_idx = not_nas[word_count_col].idxmin()
        max_idx = not_nas[word_count_col].idxmax()

        examples = grants_df.loc[[min_idx, max_idx], ["funder_num", "funder_name", "grant_id", "recipient_name", word_count_col, col]]
        examples.index = ["Minimum", "Maximum"]

        display(examples)
    else:
        print("No data available\n")
GRANT_TITLE
______________________________
funder_num funder_name grant_id recipient_name word_count_grant_title grant_title
Minimum 1125537 PEOPLE'S HEALTH TRUST 360G-PeoplesHealthTrust-2015_153 BEAT ROUTES 1.00 #LEVELS
Maximum 263207 JOHN ELLERMAN FOUNDATION 360G-Ellerman-18741 MORE IN COMMON 38.00 MORE IN COMMON WILL PROVIDE SUPPORT FOR THE FO...
GRANT_DESC
______________________________
funder_num funder_name grant_id recipient_name word_count_grant_desc grant_desc
Minimum 263207 JOHN ELLERMAN FOUNDATION 360G-Ellerman-17024 CLORE SOCIAL LEADERSHIP PROGRAMME 1.00 BURSARIES
Maximum 1091263 LONDON COMMUNITY FOUNDATION 360G-LondonCF-A583241 ACLE EVENING WOMEN'S INSTITUTE 259.00 MAJORITY OF OUR SERVICE USERS ARE FROM BME COM...
RECIPIENT_ACTIVITIES
______________________________
funder_num funder_name grant_id recipient_name word_count_recipient_activities recipient_activities
Minimum 283813 LONDON MARATHON CHARITABLE TRUST LIMITED 360G-LMCT-68_2023 EDMONTON COMMUNITY PARTNERSHIP 1.00 .
Maximum 1091263 LONDON COMMUNITY FOUNDATION 360G-LondonCF-A561932 LEWISHAM MULTILINGUAL ADVICE SERVICE 500.00 LEWISHAM MULTILINGUAL ADVICE SERVICE (LMLAS) I...
#low word counts - count and view short grant details from 360Giving API/CC public extract
grant_details = {
    "grant_title": "Short grant title",
    "grant_desc": "Short grant description",
    "recipient_activities": "Short recipient activities"
}

for col, title in grant_details.items():
    grants_df[f"short_{col}_section"] = (
        (grants_df[col].str.split().str.len() < 3)
    )
    short_grant_details = grants_df[grants_df[f"short_{col}_section"]][col].dropna().unique()
    
    # print(f"\n{title}: {grants_df[f'short_{col}_section'].sum():,}")
    # print_in_rows(short_grant_details, 12)
#high word counts - check top 10 longest grant details (to manually check)
long_grant_title = get_longest_values(grants_df, "word_count_grant_title", "grant_id")
long_grant_desc = get_longest_values(grants_df, "word_count_grant_desc", "grant_id")
long_recip_activities = get_longest_values(grants_df, "word_count_recipient_activities", "grant_id")

sections = [
    ("grant_title", long_grant_title),
    ("grant_desc", long_grant_desc),
    ("recipient_activities", long_recip_activities)
]

for name, longest in sections:
    print(f"Top 10 longest {name}: {longest}")
Top 10 longest grant_title: ['360G-Ellerman-18741', '360G-Ellerman-18295', '360G-Ellerman-17745', '360G-Ellerman-18026', '360G-LMCT-07_2021', '360G-Ellerman-20290', '360G-Ellerman-20116', '360G-Ellerman-19350', '360G-Ellerman-18035', '360G-LMCT-18_2015']
Top 10 longest grant_desc: ['360G-LondonCF-A583241', '360G-PeoplesHealthTrust-2023_137', '360G-PeoplesHealthTrust-2023_39', '360G-PeoplesHealthTrust-2023_24', '360G-PeoplesHealthTrust-2014_176', '360G-PeoplesHealthTrust-2016_200', '360G-PeoplesHealthTrust-2023_25', '360G-PeoplesHealthTrust-2023_14', '360G-PeoplesHealthTrust-2014_192', '360G-PeoplesHealthTrust-2013_41']
Top 10 longest recipient_activities: ['360G-LondonCF-A584528', '360G-LondonCF-A561932', '360G-LondonCF-A584264', '360G-LondonCF-A575782', '360G-LondonCF-A566462', '360G-LondonCF-A585091', '360G-LondonCF-A583159', '360G-LondonCF-A567136', '360G-LondonCF-A561466', '360G-LondonCF-A565064']

Exploration of Findings from Word Count Analysis (Grants)

The word count analysis on the grants dataframe shows that there are almost 2,000 entries with a length of fewer than three words. Having inspected these, I can see that there are a small number of unusual values that need to be cleaned, such as an entry in recipient_activities that reads #NAME?. There are also several hundred grant titles that appear to be IDs as opposed to descriptive titles (e.g. VFTF R6-SOLACE-2024); these will not lend themselves to embedding, so I will remove them. They all appear to end in a dash and a year, so can be identified by “-2”.

In terms of high word counts, there are no concerns. As with the information called from the API in the funders dataframe, the longest grant details are simply relatively long and wordy. I am confident that the information from the 360Giving API is highly reliable, with a very small minority of unexpected values that can be cleaned easily.


3. Quality of Data from APIs

Although the data that has been returned from the APIs is generally clean and reliable, there are some minor issues that require tidying.

Problems Identified by Word Count Analyses

#replace ID grant titles with None
grants_df.loc[grants_df["grant_title"].str.contains("-2", na=False), "grant_title"] = None
#replace unusual values with None
unusual_values = ["#NAME?", "NIL ACTIVITY.", ".", "AS BEFORE", "IN LIQUIDATION", "NONE", "0", "N/A", "-"]
grants_df["grant_desc"] = grants_df["grant_desc"].replace(unusual_values, None)
grants_df["recipient_activities"] = grants_df["recipient_activities"].replace(unusual_values, None)
#remove html entities from activities
grants_df["recipient_activities"] = grants_df["recipient_activities"].str.replace("^&#X2;", "", regex=True, case=False)
#clean start of activities strings
grants_df["recipient_activities"] = grants_df["recipient_activities"].str.replace("^[^a-zA-Z0-9]+", "", regex=True)
#print grant details again to confirm cleaning has been successful
grant_details = {
    "grant_title": "Short grant title",
    "grant_desc": "Short grant description",
    "recipient_activities": "Short recipient activities"
}

for col, title in grant_details.items():
    grants_df[f"short_{col}_section"] = (
        (grants_df[col].str.split().str.len() < 3)
    )
    short_grant_details = grants_df[grants_df[f"short_{col}_section"]][col].dropna().unique()
    
    # print(f"\n{title}: {grants_df[f'short_{col}_section'].sum():,}")
    # print_in_rows(short_grant_details, 12)

Checking and Cleaning Other Variables (Funders)

#check funder names
name_lengths = funders_df["name"].str.len()

print(f"Funders with names < 10 characters: {(name_lengths < 10).sum()}")
print(f"Funders with names > 75 characters: {(name_lengths > 75).sum()}")

print("\nTop 10 shortest funder names (< 10 characters)")
short_names = funders_df[name_lengths < 10][["registered_num", "name", "website"]].sort_values("name", key=lambda x: x.str.len()).head(10)
display(short_names)

print("\nTop 10 longest funder names (> 75 characters)")
long_names = funders_df[name_lengths > 100][["registered_num", "name", "website"]].sort_values("name", key=lambda x: x.str.len(), ascending=False)
display(long_names)
Funders with names < 10 characters: 23
Funders with names > 75 characters: 1

Top 10 shortest funder names (< 10 characters)
registered_num name website
156 1160614 STAR https://www.starfoundation.org.uk
416 1161405 SEDCU https://www.sedcu.org.uk
559 1187731 JOSHA None
764 1082476 PRAGYA https://www.pragya.org
736 1172017 CLYMER None
578 1197149 RESEED https://reseed.org.uk
517 1087870 CMZ LTD None
783 1147360 LIFT UK https://www.liftuk.org
609 1190525 RHIZA UK https://rhizauk.uk/
897 1041449 MIRACLES https://www.miraclesthecharity.org

Top 10 longest funder names (> 75 characters)
registered_num name website
904 220487 CHARITY FOR THE REPARATION OF THE CHURCH AND S... None
#check funder websites
website_lengths = funders_df["website"].str.len()

print(f"Funders with websites < 15 characters: {(website_lengths < 15).sum()}")
print(f"Funders with websites > 75 characters: {(website_lengths > 75).sum()}")

print("\nTop 10 shortest funder websites (< 15 characters)")
short_websites = funders_df[website_lengths < 15][["registered_num", "name", "website"]].sort_values("website", key=lambda x: x.str.len()).head(10)
display(short_websites)

print("\nTop 10 longest funder websites (> 75 characters)")
long_websites = funders_df[website_lengths > 75][["registered_num", "name", "website"]].sort_values("website", key=lambda x: x.str.len(), ascending=False)
display(long_websites)
Funders with websites < 15 characters: 6
Funders with websites > 75 characters: 2

Top 10 shortest funder websites (< 15 characters)
registered_num name website
839 801622 SUSAN AND STEPHEN JAMES CHARITABLE SETTLEMENT
616 1126336 ANAV CHARITABLE TRUST
767 1069996 SUPPORT TIFFIN GIRLS' SCHOOL COMPANY
266 246116 CLERGY WIDOWS' AND ORPHANS' FUND
272 1138049 PERU PEOPLE
159 287404 LIBRARY SERVICES TRUST https://L.S.T

Top 10 longest funder websites (> 75 characters)
registered_num name website
955 200693 EVERSHOLT PAROCHIAL CHARITY https://www.eversholtvillage.co.uk/history-fac...
51 273057 WYMONDHAM COLLEGE PARENT-STAFF ASSOCIATION https://www.wymondhamcollege.org/home/partners...
#reassign null to incorrectly stored urls
funders_df.loc[website_lengths < 15, "website"] = None
#remove leading characters from paragraph columns
funders_df["activities"] = funders_df["activities"].apply(clean_start_of_text)
funders_df["objectives"] = funders_df["objectives"].apply(clean_start_of_text)

Checking and Cleaning Other Variables (Grants)

#check recipient ids - view ids with unexpected starting characters
invalid_ids = grants_df[~grants_df["recipient_id"].str.startswith(("1", "2", "3", "4", "5", "6", "7", "8", "9", "P"), na=False)]["recipient_id"].unique()
# print(list(invalid_ids))
#store original ids
grants_df["original_recipient_id"] = grants_df["recipient_id"].copy()

#remove incorrect spaces and 0s
grants_df["recipient_id"] = grants_df["recipient_id"].astype(str).str.strip()
grants_df["recipient_id"] = grants_df["recipient_id"].str.lstrip("0")

#reassign remaining invalid ids
invalid_ids = ~grants_df["recipient_id"].str.startswith(("1", "2", "3", "4", "5", "6", "7", "8", "9", "P"), na=False)
grants_df.loc[invalid_ids, "recipient_id"] = [f"invalid_{i}" for i in range(invalid_ids.sum())]

#map old and new ids
id_mapping = grants_df[grants_df["recipient_id"] != grants_df["original_recipient_id"]][["original_recipient_id", "recipient_id"]].drop_duplicates()

#check result
invalid_ids = grants_df[~grants_df["recipient_id"].str.startswith(("1", "2", "3", "4", "5", "6", "7", "8", "9", "P", "i"), na=False)]["recipient_id"].unique()
print(f"Invalid IDs remaining: {list(invalid_ids)}")
Invalid IDs remaining: []
#remove leading characters from paragraph columns
grants_df["recipient_activities"] = grants_df["recipient_activities"].apply(clean_start_of_text)

Explanation of Cleaning Process

Funder Names and Websites

I am satisfied that there are no concerns with the length of funder names. Looking at the shortest websites, there appeared to be one invalid URL, and five that are stored in an incorrect null format; I reassigned all six of these to None.

Recipient IDs

Some values in recipient_id need to be cleaned. There is a small subset with incorrect starting characters, and several that are not registered charity numbers. Working with organisations that are not registered with the Charity Commission (of England & Wales) is out of the scope of this project. Where there is clearly an error in the Charity Commission registered number, I have cleaned these values. I have removed spaces and 0s from the start of IDs and reassigned any other values (e.g. N/A or Exempt) to show that the ID was invalid. Where recipient_id is a company number or a registered number from Scotland or Northern Ireland, I have also assigned an “invalid” ID. The text content of these records can be embedded as-is, but anything else (e.g. calling a different API) is out of the scope of the project so no additional value will be gleaned from keeping the original IDs.


4. Quality of Extracted Sections and Grants

Sections

To extract sections (i.e activities_objectives, achievements_performance and grants_policy), I have relied on regex matching. Unfortunately, as observed during the Word Count Analysis, there are noteable errors and quality issues that would degrade the quality of embeddings and undermind the reliability of semantic similarity comparisons. As such, they cannot be ignored. The grants have been extracted using the Claude API (using the Haiku 3 model), which has resulted in some errors but not to the same extent, and not as irreperable. I will reprocess the PDFs by running them through the Claude API, to improve the chances of the sections being correctly and consistently extracted.

#load reprocessed sections
sections_df = pd.read_csv("./6.2_sections_reprocessor/llm_sections.csv")
sections_df.head()
registered_num year_processed objectives_activities achievements_performance grant_policy error
0 272161 2024 The trustees are directed by the trust deed to... The Trust generated £261,918 (2023: £259,731) ... The published policy of the trustees is to hel... NaN
1 211715 2024 The object of the Trust is to relieve persons ... During the year the Grants Sub-Committee revie... The Trustees consider applications for grants ... NaN
2 1007307 2024 The objective of the charity is to provide fin... NaN NaN NaN
3 235891 2024 The charity is controlled by its governing doc... NaN NaN NaN
4 803686 2025 The charity is a non-profit seeking charitable... The charity's achievements during the period i... When considering applications for grants, the ... NaN
#check efficacy of sections reprocessor
sections_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 295 entries, 0 to 294
Data columns (total 6 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   registered_num            295 non-null    int64 
 1   year_processed            295 non-null    int64 
 2   objectives_activities     276 non-null    object
 3   achievements_performance  263 non-null    object
 4   grant_policy              154 non-null    object
 5   error                     5 non-null      object
dtypes: int64(2), object(4)
memory usage: 14.0+ KB
#view 10% random selection of each reprocessed section
section_columns = ["objectives_activities", "achievements_performance", "grant_policy"]

for col in section_columns:
    #get sample of sections with values
    non_null = sections_df[sections_df[col].notna()]
    sample_size = int(len(non_null) * 0.1)
    sample = non_null.sample(n=sample_size, random_state=42)

    #display sections to spotcheck
    print(f"\n{col.upper()}")
    print(f"Total: {len(non_null)}\nSample: {sample_size}\n")

    # for idx, (i, row) in enumerate(sample.iterrows(), 1):
    #     print(f"---Sample {idx} (Funder {row['registered_num']})---")
    #     print(row[col])
    #     print()

OBJECTIVES_ACTIVITIES
Total: 276
Sample: 27


ACHIEVEMENTS_PERFORMANCE
Total: 263
Sample: 26


GRANT_POLICY
Total: 154
Sample: 15
#drop sections columns from funder df
sections_cols = ["objectives_activities", "achievements_performance", "grant_policy"]
funders_df = funders_df.drop(columns=sections_cols, errors="ignore")
sections_df["registered_num"] = sections_df["registered_num"].astype(str)

#make sections uppercase
for col in sections_cols:
    sections_df[col] = sections_df[col].str.upper()

#merge reprocessed sections back into funders df
funders_df = funders_df.merge(
    sections_df[["registered_num"] + sections_cols],
    on="registered_num",
    how="left"
)

#confirm merge
print(f"objectives_activities: {funders_df['objectives_activities'].notna().sum()}, \
achievements_performance: {funders_df['achievements_performance'].notna().sum()}, \
grant_policy: {funders_df['grant_policy'].notna().sum()}")
objectives_activities: 276, achievements_performance: 263, grant_policy: 154

Exploration of Reprocessed Sections

The output from the .info() method shows that more funders now have values for the extracted sections - increasing from 52 to 154 funders with grant_policy, from 193 to 263 funders with achievements_peformance and from 229 to 276 funders with objectives_activities. The random spotcheck of 10% of the values for each section has revealed that the quality is much higher. Other than some small spelling errors (e.g. Uttar Grades instead of Uttar Pradesh), there do not seem to be extensive quality issues in these sections that could severely impact the quality of embeddings. The fact that I have noted issues with extracted grants (which have only ever been extracted using Claude) demonstrates that using an LLM to extract text from PDFs cannot be relied upon as a perfect method; however, given its substantial improvement over regex matching (and the absence of more robust extraction methods within the project’s constraints), I consider it to be the most appropriate solution available.

Grants - Names

The messiest part of the extracted grants data is recipient_name, which has been built from two sources. The first is the Charity Commission’s Public Extract, which is updated daily and includes registered charities’ names as logged with the Charity Commission. These names are generally clean and standardised, though charities occasionally use trading names that differ from their official registered names.

The second source is the organisation’s name as extracted from funders’ accounts by the Claude API. This is significantly less reliable and is a noted limitation of this project. The reliability is affected in part by the reliance on a large language model and its fuzzy intepretations of messy text (although this is clearly preferable to regex), but also due to the fact that funders’ accounts are not held to any standard of enforcement in terms of the correct spelling, uniformity, or indeed accuracy of recipients’ names. Examples that have been noted during the course of this project include inconsistency in pluralisation and spacing (e.g. hospices vs hospice, Tearfund vs Tear Fund); missing words (e.g. British Red Cross Society vs British Red Cross); errors in punctuation (e.g. soldiers’ vs solider’s); and inconsistency in word order (e.g. Lincoln University vs University of Lincoln). These inconsistencies mean that grants to the same recipient may appear under multiple name variants in the database. However, as the current system focuses on funder-applicant alignment rather than recipient-level analysis these naming inconsistencies will not impact the core functionality and this remains an area for future development.

Human Recipients of Grants

As some funders give grants to individuals as well as organisations, Claude API has extracted some human names from the accounts. This is publicly available data but, even so, I do not feel comfortable storing people’s names in my database - plus, it will add no value to the project. As such, I will attempt to flag the human records for removal.

#flag records that might be human names
extracted_recipients = grants_df[grants_df["recipient_id"].str.startswith("PDF", na=False)].copy()
extracted_recipients["is_potential_human"] = extracted_recipients["recipient_name"].apply(check_names)
flagged_as_human = extracted_recipients[extracted_recipients["is_potential_human"]]["recipient_name"].unique()

print(f"Total unique extracted recipients: {extracted_recipients['recipient_name'].nunique():,}")
print(f"Names flagged as potentially human: {len(flagged_as_human):,}")

#add as column to grants_df
grants_df["is_potential_human"] = False
grants_df.loc[grants_df["recipient_id"].str.startswith("PDF", na=False), "is_potential_human"] = \
    grants_df.loc[grants_df["recipient_id"].str.startswith("PDF", na=False), "recipient_name"].apply(check_names)
Total unique extracted recipients: 6,865
Names flagged as potentially human: 552
grants_df.sort_values("is_potential_human", ascending=False).head(2)
grant_title grant_desc amount year grant_id source funder_num funder_grants_id funder_name recipient_id recipient_grants_id recipient_name recipient_activities recipient_areas recipient_causes recipient_beneficiaries amount_gbp word_count_grant_title word_count_grant_desc word_count_recipient_activities short_grant_title_section short_grant_desc_section short_recipient_activities_section original_recipient_id is_potential_human
20615 None None 1218.04 2023 2023_226743_24 Accounts 226743 59728 KNARESBOROUGH RELIEF IN NEED CHARITY PDF-000185 58539 AMANDA AGNEW None [] [] [] £1,218.04 NaN NaN NaN False False False PDF-000185 True
20619 None None 921.50 2023 2023_226743_28 Accounts 226743 59732 KNARESBOROUGH RELIEF IN NEED CHARITY PDF-000189 58543 LAURA ANDERSON None [] [] [] £921.50 NaN NaN NaN False False False PDF-000189 True
#remove records
grants_df = grants_df[~((grants_df["funder_name"] == "KNARESBOROUGH RELIEF IN NEED CHARITY") &
                          (grants_df["is_potential_human"] == True))]
#delete rows where recipient_name starts with a number
grants_df = grants_df[~grants_df["recipient_name"].str.match(r'^\d', na=False)]

Grants - Duplication

Charity accounts follow a comparative reporting format where they present figures for both the current financial year and the previous financial year side by side. This means that the grants that have been extracted by the LLM are likely to be duplicated in some cases. I will identify duplicates by comparing the funder, recipient, grant amount and years. Where the exact same funder-recipient-amount combination is found in two consecutive years this is likely to be a duplicate, so I will remove these instances.

#sort grants extracted from accounts to spot duplicates
accounts_grants = grants_df[grants_df["source"] == "Accounts"].copy()
accounts_grants = accounts_grants.sort_values(["funder_num", "recipient_name", "amount", "year"])

#compare via shifted column
accounts_grants["prev_funder"] = accounts_grants["funder_num"].shift(1)
accounts_grants["prev_recipient"] = accounts_grants["recipient_name"].shift(1)
accounts_grants["prev_amount"] = accounts_grants["amount"].shift(1)
accounts_grants["prev_year"] = accounts_grants["year"].shift(1)

#mark as duplicate is same funder, recipient and amount but one year apart
accounts_grants["is_consecutive_dup"] = (
    (accounts_grants["funder_num"] == accounts_grants["prev_funder"]) &
    (accounts_grants["recipient_name"] == accounts_grants["prev_recipient"]) &
    (accounts_grants["amount"] == accounts_grants["prev_amount"]) &
    ((accounts_grants["year"] - accounts_grants["prev_year"]) == 1)
)

print(f"Found {accounts_grants['is_consecutive_dup'].sum()} consecutive year duplicates")
Found 1820 consecutive year duplicates
#remove duplicated grants and combine back with other grants
accounts_deduped = accounts_grants[~accounts_grants["is_consecutive_dup"]].drop(
    columns=["prev_funder", "prev_recipient", "prev_amount", "prev_year", "is_consecutive_dup"]
)
g360_grants = grants_df[grants_df["source"] == "360Giving"]
grants_df = pd.concat([g360_grants, accounts_deduped], ignore_index=True)

print(f"Removed {len(accounts_grants) - len(accounts_deduped)} duplicate grants")
print(f"New total grants: {len(grants_df):,}")
Removed 1820 duplicate grants
New total grants: 30,955

Grants - Amounts

As explored during the missingness analysis, some amounts are null and I have chosen to retain these. I will also sense check very small and very large amounts to confirm that they are indeed correct. I will exclude grants with values of £0 because these are typically accounting adjustments, regranted funds, or administrative entries rather than nulls or actual grant awards (360Giving, n.d.).

#get grants less than £100 except where this is expected
low_value_grants = grants_df[(grants_df["amount"] < 100) & (grants_df["amount"] > 0) & \
    (~grants_df["funder_name"].str.startswith("ROTARY", na=False)) & \
        (~grants_df["funder_name"].str.startswith("JAMES", na=False))]
#multiply by 1000 where relevant
multiply_000s_funders = ["1060133", "255281"]
for fun_num in multiply_000s_funders:
    grants_df.loc[(grants_df["funder_num"] == fun_num) & (grants_df["amount"] < 100), "amount"] = \
        grants_df.loc[(grants_df["funder_num"] == fun_num) & (grants_df["amount"] < 100), "amount"] * 1000

#reassign where amount is incorrect
grants_df.loc[(grants_df["funder_num"] == "298083") & (grants_df["amount"] == 1), "amount"] = None
grants_df.loc[(grants_df["grant_id"] == "2023_1045479_1"), "amount"] = 278.00

#recalculate gbp amounts
grants_df["amount_gbp"] = grants_df["amount"].apply(add_gbp_columns)

#drop rows with human names
grants_df = grants_df[(grants_df["recipient_id"] != "PDF-000179") & (grants_df["recipient_id"] != "PDF-000180")]
low_value_cleaned = grants_df[(grants_df["amount"] < 100) & (grants_df["amount"] > 0) & \
    (~grants_df["funder_name"].str.startswith("ROTARY", na=False)) & \
        (~grants_df["funder_name"].str.startswith("JAMES", na=False))]

low_value_cleaned.head(5)
grant_title grant_desc amount year grant_id source funder_num funder_grants_id funder_name recipient_id recipient_grants_id recipient_name recipient_activities recipient_areas recipient_causes recipient_beneficiaries amount_gbp word_count_grant_title word_count_grant_desc word_count_recipient_activities short_grant_title_section short_grant_desc_section short_recipient_activities_section original_recipient_id is_potential_human
18912 None None 14.00 2021 2021_1045479_1 Accounts 1045479 64186 BARKER-MILL FOUNDATION PDF-002489 62997 PAROCHIAL CHURCH COUNCIL OF THE ECCLESIASTICAL... None [] [] [] £14.00 NaN NaN NaN False False False PDF-002489 False
22513 None None 30.00 2020 2020_222652_14 Accounts 222652 62270 ROCHDALE FUND FOR RELIEF IN SICKNESS PDF-001568 61081 INJURED JOCKEY'S FUND None [] [] [] £30.00 NaN NaN NaN False False False PDF-001568 False
22635 None None 51.00 2021 2021_223903_18 Accounts 223903 62780 CLEEVES AND WHITEHEAD TRUST PDF-001805 61591 CARL WHITEHEAD LODGE None [] [] [] £51.00 NaN NaN NaN False False False PDF-001805 False
22842 None None 25.00 2021 2021_224590_1 Accounts 224590 62504 SHAW LANDS TRUST PDF-001638 61315 BARNSLEY & DISTRICT GENERAL HOSPITAL None [] [] [] £25.00 NaN NaN NaN False False False PDF-001638 False
22843 None None 35.00 2020 2020_224590_1 Accounts 224590 62544 SHAW LANDS TRUST PDF-001638 61355 BARNSLEY & DISTRICT GENERAL HOSPITAL None [] [] [] £35.00 NaN NaN NaN False False False PDF-001638 False

Explanation of Grants Cleaning Process

There are 6,865 grants in the database that have been extracted from PDFs, but using the names dataset I was able to filter this to 552 flagged as potentially containing human names. I could immediately see that the main culprit was a funder called KNARESBOROUGH RELIEF IN NEED CHARITY, which has named its individual recipients in its accounts. For the sake of reproducibility, I will programmatically remove records from this funder and with only two words - but for a more thorough removal it would be best to inspect each PDF-extracted grant record manually.

The deduplication process keeps the earliest year of each grant, as the earlier accounts show the actual grant, and later accounts are just showing “prior year” comparisons. In terms of functionality, this means that prospie will be able to inform the user, for example, ‘this funder gave £1,000 to XYZ Charity in 2023’.

Regarding cleaning the grants amounts data, I checked for values lower than £100 as this is a relatively unusual value for a grant. This showed that some funders (such as Rotary Clubs) can be expected to give low value grants, as this is how their distribution model works. Around 60% of the remaining low value were attributable to grants being reported in thousands of pounds (£000s) in the original accounts, e.g. a grant of £22,000 was recorded as £22, so I multiplied these values by 1000. The last transformation on grant amounts was for one funder whose accounts included the names of grant recipients but not values, and the LLM had mistakenly assigned an incorrect value of £1 to these grants. A spotcheck of the remaining low value suggests that these are accurate, with 0.14% of the grants in the dataset having a value of less than £100.


Descriptive Statistics

Summary Statistics

#build df of summary statistics
summary_data = make_summary_df(funders_df, grants_df)
summary_df = pd.DataFrame(summary_data)
summary_df["Value"] = summary_df.apply(format_stats, axis=1)
summary_df = format_df(summary_df)
Metric Value
Total funders 996
Funders with grants 309
Funders with grants from 360Giving 11
Total recipients 19,466
Total grants 30,716
Total grant value £1,428,662,939.67
Mean grants per funder 106
Most grants given by a funder 6,987
Fewest grants given by a funder 1
Mean recipients per funder 72.9
Mean areas per funder 3.6
Mean funder income £1,258,942.79
Median funder income £46,068.00
Mean funder expenditure £1,188,764.68
Median funder expenditure £43,765.50
Largest funder by mean income Save The Children Fund (£284,868,800.00)
Largest funder by mean expenditure Save The Children Fund (£281,939,194.60)
Mean grants per recipient 1.6
Range of grants received 1 to 79
Recipient of largest grant Patricia, Donald, Christine & Patrick Shepherd Foundation (£11,182,253.00) From Patricia And Donald Shepherd Charitable Trust
Mean grant size £46,587.85
Median grant size £11,791.50
Standard deviation £148,858.47
Smallest grant £14.00
Largest grant £11,182,253.00
Earliest grant 2001

Calculated Statistics

#get calculated stats and build df
stats = calculate_stats(funders_df, grants_df)
calculated_data = make_calculated_df(stats)
calculated_df = pd.DataFrame(calculated_data)
calculated_df["Value"] = calculated_df.apply(format_stats, axis=1)
calculated_df = format_df(calculated_df)
Metric Value
Share of grants from top 10% funders (by income) 71.6%
Share of grants to top 10% recipients (by grant value) 71.5%
Percent of recipients with multiple grants from same funder 21.5%
Average grants per funder-recipient pair 1
Mean grants-to-income ratio 119.6%
Median grants-to-income ratio 57.6%
Percent of funders supporting General Charitable Purposes only 16.8%
Percent of extracted grants matched with recipient data 23.2%

Exploration of Giving Patterns and Funder Characteristics

1. Categorical Analysis

Classification Popularity

One of the key objectives of prospie is to help fundraisers navigate the confusing trusts landscape, which is confusing largely due to the mismatch of information that is available. Funders may indicate a particular cause or area of activity, but these do not always align with their actual giving habits. I will therefore compare the classifications that funders state (their identified causes, beneficiaries, and areas of activity) with those of the recipients who are awarded their grants.

#display plots to compare funders' and recipients' classifications
fig, axes = plt.subplots(3, 2, figsize=(18, 15))

#areas
make_bar_chart(grants_df, "recipient_areas", "Recipients: Most Popular Areas", color=colours[1], ax=axes[1, 0])
make_bar_chart(funders_df, "areas", "Funders: Most Popular Areas", color=colours[1], ax=axes[1, 1])

#beneficiaries
make_bar_chart(grants_df, "recipient_beneficiaries", "Recipients: Most Popular Beneficiaries", color=colours[2], ax=axes[2, 0])
make_bar_chart(funders_df, "beneficiaries", "Funders: Most Popular Beneficiaries", color=colours[2], ax=axes[2, 1])

#causes
make_bar_chart(grants_df, "recipient_causes", "Recipients: Most Popular Causes", color=colours[0], ax=axes[0, 0])
make_bar_chart(funders_df, "causes", "Funders: Most Popular Causes", color=colours[0], ax=axes[0, 1])

plt.tight_layout()
plt.subplots_adjust(hspace=0.4)
plt.show()

Relationships between Categories

#explode classification columns
exploded_classifications = grants_df[["recipient_causes", "recipient_beneficiaries"]].copy()
exploded_classifications = exploded_classifications.explode("recipient_causes").explode("recipient_beneficiaries")
exploded_classifications = exploded_classifications.dropna()

#create crosstab
crosstab = pd.crosstab(exploded_classifications["recipient_causes"], exploded_classifications["recipient_beneficiaries"])

#create heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(crosstab, cmap="Oranges", cbar_kws={"label": "Number of Grants"})
plt.title("Causes vs Beneficiaries Heatmap", fontsize=14, fontweight="bold")
plt.xlabel("Beneficiaries")
plt.ylabel("Causes")
plt.xticks(rotation=45, ha="right")
plt.yticks(rotation=0)
plt.tight_layout()
plt.show()

Diversity of Giving Across Categories

#count unique categories per funder
funder_diversity = grants_df.groupby("funder_num").agg({
    "recipient_areas": lambda x: len(set([a for areas in x for a in areas])),
    "recipient_beneficiaries": lambda x: len(set([b for bens in x for b in bens])),
        "recipient_causes": lambda x: len(set([c for causes in x for c in causes]))
}).rename(columns={"recipient_areas": "unique_areas",
                    "recipient_beneficiaries": "unique_beneficiaries",
                    "recipient_causes": "unique_causes",})

#create histograms
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

categories = [
    ("unique_areas", "Areas", colours[1]),
    ("unique_beneficiaries", "Beneficiaries", colours[2]),
    ("unique_causes", "Causes", colours[0])
]

for idx, (col, label, colour) in enumerate(categories):
    #remove recipients with missing classifications
    diversity_filtered = funder_diversity[funder_diversity[col] > 0]
    
    if col == "unique_areas":
        axes[idx].hist(diversity_filtered[col], bins=20, color=colour)
    else:
        value_counts = diversity_filtered[col].value_counts().sort_index()
        axes[idx].bar(value_counts.index, value_counts.values, color=colour)
        axes[idx].xaxis.set_major_locator(plt.MaxNLocator(integer=True))
    
    axes[idx].set_title(f"Diversity of Giving - {label}")
    axes[idx].set_xlabel(f"Number of Unique {label} Funded")
    axes[idx].set_ylabel("Number of Funders")
    axes[idx].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

Categorical Match Rates

#add funder classifications to grants_df
grants_funders_merge = grants_df.merge(
    funders_df[["registered_num", "causes", "areas", "beneficiaries"]],
    left_on="funder_num",
    right_on="registered_num",
    how="left",
    suffixes=("", "_funder")
)

#find overlaps to identify matches (only where recipients have classifications)
for category in ["areas", "beneficiaries", "causes"]:
    grants_funders_merge[f'{category}_match'] = grants_funders_merge.apply(lambda row: check_overlap(row[category], row[f'recipient_{category}']), axis=1)

with_classifications = grants_funders_merge[
    (grants_funders_merge["recipient_areas"].str.len() > 0) &
    (grants_funders_merge["recipient_beneficiaries"].str.len() > 0) &
    (grants_funders_merge["recipient_causes"].str.len() > 0)
]

#calculate funder match rates
funder_match_rates = with_classifications.groupby("funder_num").agg({
    "areas_match": ["sum", "mean"],
    "beneficiaries_match": ["sum", "mean"],
    "causes_match": ["sum", "mean"],
    "grant_id": "count"
}).reset_index()

funder_match_rates.columns = ["funder_num", "areas_matches", "areas_match_rate", "beneficiaries_matches", "beneficiaries_match_rate", "causes_matches", "causes_match_rate", 
"total_grants"]

#add names
funder_match_rates = funder_match_rates.merge(
    funders_df[["registered_num", "name"]],
    left_on="funder_num",
    right_on="registered_num"
)

#create histograms
fig, axes = plt.subplots(1, 3, figsize=(18, 5))

for idx, category in enumerate(["areas", "beneficiaries", "causes"]):
    col = f"{category}_match_rate"
    mean_val = funder_match_rates[col].mean()

    axes[idx].hist(funder_match_rates[col], bins=20, color=colours[idx])
    axes[idx].axvline(mean_val, color="red", linestyle="--", linewidth=2,
                    label=f"Mean: {mean_val:.1%}")
    axes[idx].set_xlabel("Match Rate")
    axes[idx].set_ylabel("Number of Funders")
    axes[idx].set_title(f"{category.title()} Match Rate Distribution")
    axes[idx].legend()

plt.tight_layout()
plt.show()

Exploration of Findings from Categorical Analysis

It can be observed that funders often state an interest in general causes, areas and beneficiaries - many do not specify particular interests and state that they will consider applications from any area of the sector. Further analysis will be useful, particularly following the creation of embeddings, to understand the practical reality of their funding priorities, which may reveal implicit preferences or local biases not reflected in their published criteria, e.g. if ‘in reality, donations are completely earmarked for a few select beneficiaries’ (Lock and Robinson, 2023).

Whilst the heatmap shows the relationship between causes and beneficiaries across all grants, it is heavily skewed by high-volume funders like Esmée Fairbairn Foundation (6,000+ grants). This means that the patterns shown reflect individual funder preferences rather than sector-wide trends, so it does not offer us much useful insight or relevance to the alignment model. It would be more useful to analyse funder behaviour on an individual level, such as quantifying how well their stated preferences match with their actual giving.

Most funders concentrate their giving within a relatively narrow scope. There’s a long tail of funders who spread their giving more widely, but these are the exception. This clustering suggests that funders typically have quite focused areas of interest, even whe they state broad ranges in their registered details. I was unable to find an explicit explanation of the huge peak at 7 causes supported, but evidence suggests that Charity Commission classifications may have previously been limited to seven general causes, before a wider range of more precise causes were implemented at a later date.

Match rates show an interesting pattern - causes has the highest match rate at around 70%, followed by beneficiaries at 61%, whilst areas sits lowest at 45%. However, this doesn’t mean causes are more important - it’s actually the opposite. Causes are often incredibly generic (many charities state that they work in vague areas such as ‘education’ or ‘health’), so high match rates don’t tell us much. Areas, on the other hand, can be specific right down to town level, so a match there is much more meaningful. This has important implications for the weighting - areas need to be weighted more highly to reflect their specificity, and keyword matching will need careful calibration to avoid over-scoring when generic terms like ‘education’ appear on both sides.

A limitation of undertaking the analysis at this stage is missing recipient data; only 23.2% of grants that have been extracted from accounts are matched with a recipient (with data added by the recipients_table_builder). This means that around 3/4 of recipients do not have classifications data to analyse. This analysis will likely be more insightful once the embeddings data is available and semantic analysis can be performed on recipient_name and funders’ details.


2. Temporal and Financial Analysis

There is a disparity in the number of grants from each data source (i.e. the 360Giving API and extracted from accounts); the API returns historical data for this sample of funders reaching back to 2001, whereas accounts downloaded from the Charity Commission website date back only five years - 2020 at the point of writing. As such, there is a notable increase in the number of grants from 2020. This is a limitation to the strength of the analysis of giving volume, but it is still possible to explore patterns in grant amounts and other factors over time.

#set up analysis dfs
grants_source_yr = grants_df.groupby(["year", "source"]).agg({
    "grant_id": "count",
    "amount": ["sum", "mean"]
}).reset_index()
grants_source_yr.columns = ["year", "source", "num_grants", "total_amount", "avg_amount"]

grants_yr = grants_df.groupby("year").agg({
    "grant_id": "count",
    "amount": ["sum", "mean", "median"]
}).reset_index()
grants_yr.columns = ["year", "num_grants", "total_amount", "avg_amount", "median_amount"]

grants_over_1 = grants_df[grants_df["amount"] > 0].copy()
years_unique = sorted(grants_over_1["year"].unique())
data_by_year = [grants_over_1[grants_over_1["year"] == year]["amount"].values for year in years_unique]

Number of Grants

#pivot to look at grants by source and year
grants_pivot = grants_source_yr.pivot(index="year", columns="source", values="num_grants").fillna(0)

#create stacked bar chart
fig, ax = plt.subplots(figsize=(12, 6))
ax.stackplot(grants_pivot.index,
            grants_pivot["360Giving"],
            grants_pivot["Accounts"],
            labels=["360Giving", "Accounts"],
            colors=[colours[0], colours[1]],
            alpha=0.7)

ax.set_title("Number of Grants by Source Over Time", fontsize=14, fontweight="bold")
ax.set_ylabel("Number of Grants")
ax.legend(loc="upper left")
ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

Values of Grants

#create line chart
fig, ax = plt.subplots(figsize=(12, 6))

ax.plot(grants_yr["year"], grants_yr["total_amount"], marker="o", linewidth=2, color=colours[2], label="Total")
for source in grants_source_yr["source"].unique():
    data = grants_source_yr[grants_source_yr["source"] == source]
    color = colours[0] if source == "360Giving" else colours[1]
    ax.plot(data["year"], data["total_amount"], marker=".", linewidth=1.5, color=color, alpha=0.4, linestyle="--", label=source, zorder=2)
ax.set_title("Total Amount Awarded Over Time", fontsize=14, fontweight="bold")
ax.set_ylabel("£ (millions)")
ax.legend(loc="upper left")
ax.grid(True, alpha=0.3)
ax.yaxis.set_major_formatter(FuncFormatter(lambda x, p: f'{x/1_000_000}'))

plt.tight_layout()
plt.show()

Distribution of Grants

#create boxplot
fig, ax = plt.subplots(figsize=(12, 6))
bp = ax.boxplot(data_by_year, labels=years_unique, showfliers=True)

#set log scale
ax.set_yscale("log")

ax.set_title("Distribution of Grant Values", fontsize=14, fontweight="bold")
ax.set_ylabel("Grant Value (£, log10)")
ax.grid(True, alpha=0.3, axis="y")
ax.yaxis.set_major_formatter(FuncFormatter(lambda x, p: f'£{int(x):,}'))

plt.tight_layout()
plt.show()

Exploration of Findings from Temporal and Financial Analysis

In the Number of Grants chart, the sharp increase from 2020 mostly reflects the data collection methodology rather than a genuine surge in grant-making - this is when the accounts data starts. Before 2020, we’re relying solely on 360Giving data which is patchier. However, the pandemic also dramatically affected grant-making, and this is the other explanatory factor for the sharp increase. The balance between the two sources shows that 360Giving provides good historical coverage back to the early 2000s, but accounts extraction is essential for capturing recent activity from funders who don’t publish to 360Giving.

In terms of giving values, there is notable year-on-year variation, with notable peaks in 2008, 2017, and 2021-22 and a general increase over time. The accounts data provides more stable coverage from 2020 onwards, but it is important to note that fluctuations in reporting (requirements) are a significant source of uncertainty. Grant values follow a log-normal distribution - most grants cluster in the £1,000-£10,000 range, but there’s a substantial tail of larger grants stretching into the hundreds of thousands.

The year-on-year consistency in the distribution suggests that typical grant sizes remain relatively stable over time, despite fluctuations in total volumes. The outliers above the whiskers represent major grants, often to large established charities or capital projects. Interestingly, the largest “grant” in the dataset was a transfer of funds from one foundation to its sister trust, representing a redistribution of assets rather than a true donation.


Exporting Dataframes back to Supabase

Table and Relationships Setup

#get final funder table
funders_table = funders_df[[
    "registered_num",
    "name",
    "website",
    "activities",
    "objectives",
    "achievements_performance",
    "grant_policy",
    "objectives_activities",
    "income_latest",
    "expenditure_latest",
    "is_on_list",
    "is_potential_sbf"
]]

#get final recipients table
recipients_table = grants_df[[
    "recipient_id",
    "recipient_name",
    "recipient_activities"
]].drop_duplicates(subset=["recipient_id"])

#add check for cleaned recipient ids
recipients_table["has_cleaned_id"] = recipients_table["recipient_id"].isin(id_mapping["recipient_id"].values)

#get final grants table
grants_table = grants_df[[
    "grant_id",
    "grant_title",
    "grant_desc",
    "amount",
    "year",
    "source"
]]
#dictionary to hold tables and their keys
tables = {
    "funders": (funders_table, "registered_num"),
    "grants": (grants_table, "grant_id"),
    "recipients": (recipients_table, "recipient_id")
}

Updated Recipient IDs - Join Tables

supabase = create_client(url, key)

#get old ids
old_recip_ids = id_mapping["original_recipient_id"].tolist()

if len(old_recip_ids) > 0:

    #remvoe from join tables
    join_tables = ["recipient_grants", "recipient_areas", "recipient_beneficiaries", "recipient_causes"]
    
    for table in join_tables:
        try:
            batch_size = 100
            for i in range(0, len(old_recip_ids), batch_size):
                batch = old_recip_ids[i:i+batch_size]
                response = supabase.table(table).delete().in_("recipient_id", batch).execute()
        except Exception as e:
            print(f"Error deleting from {table}: {e}")
    
    #remove from recipients table
    try:
        batch_size = 100
        for i in range(0, len(old_recip_ids), batch_size):
            batch = old_recip_ids[i:i+batch_size]
            response = supabase.table("recipients").delete().in_("recipient_id", batch).execute()
    except Exception as e:
        print(f"Error deleting from recipients: {e}")
    
    print("Deletion successful")
else:
    print("No IDs to clean.")
Deletion successful
#make dictionary to map ids and unique ids
id_map_dict = dict(zip(id_mapping["original_recipient_id"], id_mapping["recipient_id"]))
recipient_ids = set(recipients_table["recipient_id"].values)

#update all join tables
updated_recipient_grants = update_join_table(recipient_grants, id_map_dict, recipient_ids)
updated_recipient_areas = update_join_table(recipient_areas, id_map_dict, recipient_ids)
updated_recipient_causes = update_join_table(recipient_causes, id_map_dict, recipient_ids)
updated_recipient_beneficiaries = update_join_table(recipient_beneficiaries, id_map_dict, recipient_ids)

#drop helper column
recipients_table = recipients_table.drop(columns=["has_cleaned_id"])

Piping Data Back to Supabase

#upsert main tables
pipe_to_supabase(funders_table, "funders", "registered_num", url, key)
pipe_to_supabase(grants_table, "grants", "grant_id", url, key)
pipe_to_supabase(recipients_table, "recipients", "recipient_id", url, key)

#upsert join tables
pipe_to_supabase(updated_recipient_grants, "recipient_grants", "recipient_grants_id", url, key)
pipe_to_supabase(updated_recipient_areas, "recipient_areas", "recipient_areas_id", url, key)
pipe_to_supabase(updated_recipient_causes, "recipient_causes", "recipient_cause_id", url, key)
pipe_to_supabase(updated_recipient_beneficiaries, "recipient_beneficiaries", "recipient_ben_id", url, key)

print("Data upserted successfully")
✓ Successfully upserted all 996 records to funders
✓ Successfully upserted all 30716 records to grants
✓ Successfully upserted all 19466 records to recipients
✓ Successfully upserted all 30831 records to recipient_grants
✓ Successfully upserted all 18341 records to recipient_areas
✓ Successfully upserted all 21319 records to recipient_causes
✓ Successfully upserted all 16843 records to recipient_beneficiaries
Data upserted successfully