People

Most frequently occurring person names

This page provides information about person names, i.e. authors and editors.

Code

# === Imports === 

import re 
import seaborn as sns
from matplotlib import pyplot as plt
from os.path import join
from os.path import realpath, dirname
import os
from lxml import etree
from io import StringIO, BytesIO
from collections import Counter
import pandas as pd


# === Files and parameters === 

bibdatafile = join("data", "BIB18_Zotero-RDF_2024-01-02.rdf") 
#bibdatafile = join("data", "BIB18_Zotero-RDF_TEST.rdf") 


namespaces = {
    "foaf" : "http://xmlns.com/foaf/0.1/",
    "bib" : "http://purl.org/net/biblio#",
    "dc" : "http://purl.org/dc/elements/1.1/",
    "z" : "http://www.zotero.org/namespaces/export#",
    "rdf" : "http://www.w3.org/1999/02/22-rdf-syntax-ns#"
    }


bibdata = etree.parse(bibdatafile)

Code

def get_personnames(bibdata): 

    # Find all the instances of persons
    personnames = []
    xpath = "//foaf:Person"
    persons = bibdata.xpath(xpath, namespaces=namespaces)
    print("There are " + str(len(persons)) + " instances of the Element 'person' in the dataset.")

    # Get the names (full name or first name, last name) from each person
    for item in persons: 
        if len(item) == 1: 
            personname = item[0].text
            personnames.append(personname)
        elif len(item) == 2: 
            personname = item[0].text + ", " + item[1].text 
            personnames.append(personname)    
    print("There are " + str(len(Counter(personnames))) + " different person names in the dataset.")
    return personnames

global personnames
personnames  = get_personnames(bibdata)

There are 96424 instances of the Element 'person' in the dataset.
There are 30514 different person names in the dataset.

Visualization of the most frequent person names

These persons could be authors or editors of publications.

Code

def most_frequent_persons(personnames):
    # Count the occurrences, find the 10 most frequently mentioned publishers
    personnames_counts = Counter(personnames)
    personnames_counts = dict(
        sorted(personnames_counts.items(), 
        key = lambda item: item[1], reverse=True)[:20]
        )
    
    columns = ["count"]
    personnames_counts = pd.DataFrame.from_dict(
        personnames_counts, 
        orient="index", 
        columns=["count"]).reset_index().rename({"index" : "person"}, 
        axis=1
        )
    
    #print(personnames_counts)
    return personnames_counts


def visualize_personnames_counts(personnames_counts): 
    #print(publishernames_counts)
    plt.figure(figsize=(12,8))
    palette = sns.color_palette("tab20b", len(personnames_counts))
    fig = sns.barplot(
        data=personnames_counts, 
        y="person", 
        x="count", 
        hue = "count", 
        legend = False,
        orient='h', 
        palette=palette,
        )
    for i in fig.containers:
        fig.bar_label(i,)
    fig.set(title="Most frequently mentioned people")
    plt.tight_layout()
    plt.savefig(
        join("figures", "personnames_counts.svg"),
        dpi=300
        )


personnames_counts = most_frequent_persons(personnames)
visualize_personnames_counts(personnames_counts)