def get_publishers(bibdata):
    """Collect every publisher name mentioned in the dataset.

    Evaluates the XPath ``//dc:publisher//foaf:name/text()`` on *bibdata*
    and prints two summary lines: the total number of matches and the
    number of distinct names.

    Args:
        bibdata: Object exposing ``xpath(expr, namespaces=...)``
            (presumably a parsed lxml tree -- confirm against the loader).

    Returns:
        Whatever ``bibdata.xpath`` returns for the expression; one entry per
        publisher mention, duplicates included.

    Note:
        Relies on a module-level ``namespaces`` mapping that must define the
        ``dc`` and ``foaf`` prefixes used in the expression.
    """
    xpath = "//dc:publisher//foaf:name/text()"
    publishers = bibdata.xpath(xpath, namespaces=namespaces)

    # Show some results
    print(
        f"There are {len(publishers)} instances of publishers "
        "mentioned in the dataset."
    )
    print(
        f"There are a total of {len(set(publishers))} different publishers "
        "mentioned in the dataset."
    )
    return publishers


def most_frequent_publishers(publishers, top_n: int = 20):
    """Build a table of the most frequently mentioned publishers.

    Args:
        publishers: Iterable of publisher names, one entry per mention.
        top_n: Number of publishers to keep (default 20, the limit the
            original code applied). ``None`` keeps every publisher.

    Returns:
        A ``pandas.DataFrame`` with the columns ``publisher`` and ``count``,
        sorted by descending count. Names with equal counts keep the order
        in which they were first encountered.
    """
    # most_common() sorts by descending count and is stable for ties, which
    # matches sorting the items with reverse=True and slicing.
    top_publishers = Counter(publishers).most_common(top_n)
    return pd.DataFrame(top_publishers, columns=["publisher", "count"])


# Module-level names are already global, so no ``global`` statement is needed.
publishers = get_publishers(bibdata)
publishernames_counts = most_frequent_publishers(publishers)
There are 37741 instances of publishers mentioned in the dataset.
There are a total of 5382 different publishers mentioned in the dataset.