def get_pubyears(bibdata):
    """Count how often each publication year occurs in the bibliographic data.

    Every ``dc:date`` text node is collected and tallied. Values that are not
    purely numeric, values known to be erroneous, and a hand-picked set of
    less relevant years are left out of the result. Two summary lines are
    printed to stdout.

    Args:
        bibdata: Object exposing ``xpath(expression, namespaces=...)``
            (presumably a parsed lxml tree -- confirm against the caller).
            The module-level ``namespaces`` mapping is used to resolve the
            ``dc`` prefix.

    Returns:
        A ``pandas.DataFrame`` with the columns ``year`` (the date string as
        found in the data) and ``count`` (number of occurrences), ordered
        ascending by the year *string*. The frame is empty, but still has
        both columns, when no usable year is found.
    """
    # Values that are not real publication years (to be corrected in the data).
    erroneous_years = {
        "0134", "0201", "0207", "22", "30", "42", "58", "76", "78", "20",
    }
    # Less relevant years (optional, of course).
    less_relevant_years = {
        "1815", "1834", "1891", "1932", "1945", "1957", "1961", "1969",
        "1973", "1974", "1975", "1978", "1979", "1981", "1982", "1983",
        "1984",
    }
    excluded_years = erroneous_years | less_relevant_years

    # Find all the instances of publication dates.
    pubyears = bibdata.xpath("//dc:date/text()", namespaces=namespaces)

    # Count the occurrences and keep only clean, relevant years. Filtering
    # here (instead of DataFrame.drop with fixed labels) means a year that is
    # absent from the data is simply skipped rather than raising KeyError.
    rows = [
        (year, count)
        for year, count in sorted(Counter(pubyears).items())
        if year.isnumeric() and year not in excluded_years
    ]
    # Explicit column names keep the frame well-formed even with zero rows.
    pubyear_counts = pd.DataFrame(rows, columns=["year", "count"])

    # Display some key figures. The first number covers every dc:date value
    # found; the second only the years that survived the filtering above.
    print(f"There are {len(pubyears)} instances of publication years mentioned in the dataset.")
    print(f"There are {len(pubyear_counts)} different years mentioned in the dataset.")
    return pubyear_counts
def visualize_pubyears(pubyear_counts):
    """Plot the number of bibliographic entries per year and save it as SVG.

    A bar chart with one bar per row of ``pubyear_counts`` is drawn on a new
    12x8 inch figure and written to ``figures/pubyear_counts.svg``. The
    ``figures`` directory is created if it does not exist yet.

    Args:
        pubyear_counts: Table with a ``year`` column (x axis) and a ``count``
            column (bar height and bar colour), e.g. the frame returned by
            ``get_pubyears``.
    """
    # Local import: only needed here to make sure the output folder exists.
    import os

    plt.figure(figsize=(12, 8))
    # NOTE(review): the palette holds len(pubyear_counts) - 1 colours while
    # the hue variable is "count"; confirm this size is what is intended.
    palette = sns.color_palette("tab20b", len(pubyear_counts) - 1)
    # The object returned by sns.barplot is used below to set the title and
    # the tick labels of the plot.
    ax = sns.barplot(
        data=pubyear_counts,
        x="year",
        y="count",
        hue="count",
        legend=False,
        palette=palette,
    )
    ax.set(title="Number of bibliographic entries per year")
    # Rotate the year labels so they do not overlap.
    ax.set_xticklabels(ax.get_xticklabels(), rotation=60)
    # savefig does not create missing directories; without this a fresh
    # checkout fails with FileNotFoundError.
    os.makedirs("figures", exist_ok=True)
    plt.savefig(join("figures", "pubyear_counts.svg"), dpi=300)
# Tally the publication years found in `bibdata` (defined elsewhere in this
# file), keep the result in a module-level name, then plot and save the chart.
pubyear_counts = get_pubyears(bibdata)
visualize_pubyears(pubyear_counts)