# ==================================================
# Analyzing '4TU.ResearchData' statistics & visualizing
# website: https://delft-rcafe.github.io/home/Plotathon.html
# data:    https://data.4tu.nl/private_datasets/fuCYKTarWe3ShLS3NQSqufXBpqHgAFHr_l0Lbi2APok
# groups:  https://github.com/4TUResearchData/djehuty/blob/main/src/djehuty/backup/resources/groups.json
# Importing packages ----
# NOTE(review): the console output (warnings, tidyverse attach banners,
# masking messages) had been pasted into the source; it is removed here so
# the file parses as valid R. Load order is preserved deliberately:
# data.table masks several lubridate helpers (hour, month, ...), so
# lubridate is re-attached last to win the search path.
#use_virtualenv("myenv")
library(rjson)       # fromJSON() for datasets.json / groups.json
library(tidyverse)   # attaches ggplot2, tibble, tidyr, readr, purrr, dplyr, stringr, forcats, lubridate
library(ggplot2)     # already attached by tidyverse; kept for explicitness
library(tidyr)
library(dplyr)
library(purrr)
library(reshape2)    # masks tidyr::smiths
library(data.table)  # masks reshape2::dcast/melt, dplyr::between/first/last, purrr::transpose
library(lubridate)   # re-attach so lubridate's date helpers shadow data.table's
# Reading the data ----
dataset <- fromJSON(file = "datasets.json")   # main dataset file (list of records)
data <- bind_rows(dataset)                    # json -> data frame

# views/downloads stats file (tab-separated, one row per dataset uuid)
datastat <- read.delim("views_downloads.tsv", header = TRUE, sep = "\t", dec = ".")

# name mapping file (from github @ 4TUResearchData/djehuty)
groups <- fromJSON(file = "groups.json")
groups <- bind_rows(groups)                   # json -> data frame

# Keeping unique rows only -- a crude but fast de-duplication on 'uuid'
data.1 <- data %>% distinct(uuid, .keep_all = TRUE)
datastat.1 <- datastat %>% distinct(uuid, .keep_all = TRUE)

# Inner join: keep only datasets that have view/download statistics
data.all <- merge(data.1, datastat.1, by = "uuid")

# Rename groups' 'id' column so it matches the datasets' 'group_id' join key
# (fixed: use <- instead of = for assignment, per the tidyverse style guide)
colnames(groups)[colnames(groups) == "id"] <- "group_id"

# Inner join: attach university/institute labels
data.all.label <- merge(data.all, groups, by = "group_id")

# Parse the ISO datetime strings into POSIXct
data.all.label$date <- ymd_hms(data.all.label$published_date)
# Keep only the columns used downstream; the commented names are kept as a
# reminder of other available columns.
# (fixed: the original had a trailing comma after `downloads`, which leaves
# an empty argument in the select() call)
data.select <- data.all.label %>% select(
  group_id,
  #uuid,
  #title,
  #published_date,
  date,
  views,
  downloads
  #name
)
# Grouping based on university/institute
data.group <- data.select %>% group_by(group_id)

# Per-group summary statistics for the various plot ideas.
# (fixed: na.rm = TRUE is now applied consistently -- the original only set
# it for sd(), so a single NA in views/downloads would have turned the
# mean/median columns into NA)
data.group.stat <- data.group %>% summarise(
  view_x = mean(views, na.rm = TRUE),         # mean of views
  view_m = median(views, na.rm = TRUE),       # median of views
  view_std = sd(views, na.rm = TRUE),         # sd of views
  download_x = mean(downloads, na.rm = TRUE), # mean of downloads
  download_m = median(downloads, na.rm = TRUE),
  download_std = sd(downloads, na.rm = TRUE)
)
data.group.stat
# Per-group quantiles (min, 25%, 50%, 75%, max) of views...
# (na.rm = TRUE added: quantile() errors on NA input by default, which would
# be inconsistent with the NA handling of the summary stats above)
data.group.quant.view <- as.data.frame(
  do.call("rbind", tapply(data.group$views, data.group$group_id, quantile, na.rm = TRUE))
)
colnames(data.group.quant.view) <- c("v0", "v25", "v50", "v75", "v100")
data.group.quant.view <- rownames_to_column(data.group.quant.view, "group_id")

# ...and of downloads
data.group.quant.download <- as.data.frame(
  do.call("rbind", tapply(data.group$downloads, data.group$group_id, quantile, na.rm = TRUE))
)
colnames(data.group.quant.download) <- c("d0", "d25", "d50", "d75", "d100")
data.group.quant.download <- rownames_to_column(data.group.quant.download, "group_id")

data.group.quant <- merge(data.group.quant.view, data.group.quant.download)  # joins on group_id
data.group.quant
data.group.quant <- merge(data.group.quant, groups)  # attach institute names

# Short keys used as plot labels.
# NOTE(review): these are assigned positionally (by row order after the
# merges), so they silently mislabel points if the group set or order
# changes -- the length guard at least catches a count mismatch.
# (also removed: a stray `NA` console-output line pasted after this block)
short_keys <- c("4TU", "TU/D", "TU/e", "UT", "WUR", "OI", "TU/D (s)",
                "TU/e (s)", "UT (s)", "NIOZ", "LU", "UU", "EUR", "D",
                "RU", "RUG", "MU")
stopifnot(length(short_keys) == nrow(data.group.quant))
data.group.quant$name_id <- short_keys
data.group.quant
# Testing different plot ideas ----

# Per-group distribution of views and downloads (base-graphics boxplots;
# the assigned value is the boxplot stats list, printed afterwards)
p1 <- boxplot(views ~ group_id, data = data.group)
p2 <- boxplot(downloads ~ group_id, data = data.group)
p1
p2

# Scatter: downloads vs views using per-group medians
p3 <- ggplot(data.group.stat, aes(x = view_m, y = download_m)) +
  geom_point()
p3

# Scatter: downloads vs views using per-group means
p4 <- ggplot(data.group.stat, aes(x = view_x, y = download_x)) +
  geom_point()
p4

# Scatter: downloads vs views using per-group standard deviations
p5 <- ggplot(data.group.stat, aes(x = view_std, y = download_std)) +
  geom_point()
p5
# Idea: cross-box as xy-error bars built from quartiles.
# Plot the 1st-3rd quartiles (as error bars) of downloads as a function of
# views around the median (2nd quartile), to compare usage trends across
# universities.
png(file = "4TU_usage_plot_v0.png", width = 1200, height = 500, res = 78)
par(mar = c(11.5, 5, 2, 3))  # wide bottom margin for the long subtitle
# Scatter of per-group medians.
# (fixed: the original had a trailing comma after `ylim`, which passes an
# empty argument to plot() and raises an error; also dropped the pointless
# `p6 <-` captures of the invisible NULL returned by png()/plot())
plot(data.group.quant$v50, data.group.quant$d50,
     main = "Institute-wise 4TU.ResearchData usage metric",
     xlab = "Views", ylab = "Downloads",
     pch = 1, cex = 1.5, lty = "solid", lwd = 2,
     xlim = c(0, max(data.group.quant$v50) * 1.1),
     ylim = c(0, max(data.group.quant$d50) * 1.1)
)
# Wrap each string in a character vector to the given line width.
# Each element is word-wrapped with strwrap() and the wrapped pieces are
# re-joined with newlines, so the result has one element per input string.
# vapply() replaces the original sapply()/as.character() combination so the
# return type is always an unnamed character vector, even for empty input.
wrap_strings <- function(vector_of_strings, width) {
  vapply(
    vector_of_strings,
    function(x) paste(strwrap(x, width = width), collapse = "\n"),
    character(1),
    USE.NAMES = FALSE
  )
}
# Long explanatory subtitle, wrapped to fit under the plot.
# NOTE(review): the key list in this caption mentions UA, IHE and UA (s),
# which are not among the 17 short keys assigned to name_id -- confirm the
# legend matches the plotted groups.
# (dropped: the meaningless `p6 <-` captures of the invisible NULLs returned
# by title()/arrows()/text(), and the lone `p6` line that printed NULL)
graphics::title(sub = wrap_strings("Usage statistics of Institute's 4TU research data repositories: Dot represent median value (2nd quartile), while flat-arrows represent inter-quartile range (IQR) (1st-3rd quartile) for Institute's repositories views and downloads \n [Key: 4TU - 4TU.ResearchData, TU/D - Delft University of Technology, TU/e - Eindhoven University of Technology, UT - University of Twente, WUR - Wageningen University and Research, OI - Other institutions, TU/D (s) - Delft University of Technology Students, TU/e (s) - Eindhoven University of Technology Students, UT (s) - University of Twente Students, NIOZ - NIOZ Royal Netherlands Institute for Sea Research, LU - Leiden University, UU - Utrecht University, UA - University of Amsterdam, EUR - Erasmus University Rotterdam, D - Deltares, IHE - IHE Delft Institute for Water Education, RU - Radboud University, RUG - University of Groningen, MU - Maastricht University, UA (s) - University of Amsterdam Students]", 185), line = 9.75)

# IQR of downloads: vertical error bars at each group's median views
arrows(x0 = data.group.quant$v50, y0 = data.group.quant$d25,
       x1 = data.group.quant$v50, y1 = data.group.quant$d75,
       code = 3, angle = 90, length = 0.1, col = "gray")

# IQR of views: horizontal error bars at each group's median downloads
arrows(x0 = data.group.quant$v25, y0 = data.group.quant$d50,
       x1 = data.group.quant$v75, y1 = data.group.quant$d50,
       code = 3, angle = 90, length = 0.1, col = "gray")

# Short institute key next to each point
text(data.group.quant$v50, data.group.quant$d50,
     labels = data.group.quant$name_id, cex = 1, pos = 4)

dev.off()  # close the PNG device (flushes the plot to disk)