Code snippets
Database credentials
Hostname: cayoinstance.c9wcxkprzjou.us-east-1.rds.amazonaws.com
Database name: cayodatabase
Username: cayoread
Password: See the Slides.
Port: 5432
0. Establishing connection
This code loads the required libraries and establishes connection to the database. It needs to be run once on top of any file which fetches data from the database.
R
library(RPostgres)
library(DBI)
con = dbConnect(RPostgres::Postgres(),
host='cayoinstance.c9wcxkprzjou.us-east-1.rds.amazonaws.com',
dbname='cayodatabase',
user='cayoread',
password='')
Python
import psycopg2
conn = psycopg2.connect(
host='cayoinstance.c9wcxkprzjou.us-east-1.rds.amazonaws.com',
dbname='cayodatabase',
user='cayoread',
password='',
)
cur = conn.cursor()
1. Get all individuals
SQL
SELECT * FROM public.individuals
R
query = dbSendQuery(con, 'SELECT * FROM public.individuals')
result = dbFetch(query) # result is now in a dataframe
Python
cur.execute("SELECT * from public.individuals")
result = cur.fetchall() # result is list of tuples
2. Get sample type distribution in the biosamples table.
SQL
select
b."biosample_type1",
b."biosample_type2",
count(*) as "counts"
from
public.biosamples b
group by
b."biosample_type1",
b."biosample_type2"
order by
b."biosample_type1",
"counts" desc;
R
query = dbSendQuery(con, 'SELECT * FROM public.biosamples')
biosamples = dbFetch(query)
result <- biosamples %>%
group_by(biosample_type1, biosample_type2) %>%
summarise(counts = n()) %>%
arrange(biosample_type1, desc(counts))
3. Get all biosamples of type brain and combine it with individual's information
SQL
SELECT
"public"."biosamples"."biosample_type1",
"public"."biosamples"."biosample_type2",
"Processings"."individual_id",
"Processings"."processing_timestamp",
"Processings"."processing_type",
"Processings"."body_weight_kgs",
"Processings"."processing_code",
"Processings"."group_id",
"Individuals"."individual_code",
"Individuals"."individual_sex",
"Individuals"."infant_code"
FROM
"public"."biosamples"
LEFT JOIN "public"."processings" AS "Processings" ON "public"."biosamples"."processing_id" = "Processings"."processing_id"
LEFT JOIN "public"."individuals" AS "Individuals" ON "Processings"."individual_id" = "Individuals"."individual_id"
where biosample_type1 = 'brain';
R
query = dbSendQuery(con, 'SELECT * FROM public.biosamples')
biosamples = dbFetch(query)
query = dbSendQuery(con, 'SELECT * FROM public.processings')
processings = dbFetch(query)
query = dbSendQuery(con, 'SELECT * FROM public.individuals')
individuals = dbFetch(query)
result <- biosamples %>%
left_join(processings, by = c("processing_id" = "processing_id")) %>%
filter(biosample_type1 == 'brain') %>%
select(biosample_type1, biosample_type2, individual_id.x, processing_timestamp, processing_type,
body_weight_kgs, processing_code, group_id)
result2 <- result %>%
left_join(individuals, by = c("individual_id.x" = "individual_id"))
4. Date of births for all individuals and their group name at birth
SQL
SELECT
"public"."individual_life_histories"."individual_life_history_id",
"public"."individual_life_histories"."individual_id",
"public"."individual_life_histories"."date",
"public"."individual_life_histories"."life_history_event",
"public"."individual_life_histories"."group_id",
"Individuals"."individual_code",
"Individuals"."individual_sex",
"Individuals"."infant_code",
"Groups"."group_name"
FROM
"public"."individual_life_histories"
LEFT JOIN "public"."individuals" AS "Individuals" ON "public"."individual_life_histories"."individual_id" = "Individuals"."individual_id"
LEFT JOIN "public"."groups" AS "Groups" ON "public"."individual_life_histories"."group_id" = "Groups"."group_id"
WHERE
"public"."individual_life_histories"."life_history_event" = 'birth';
5. For a given list of monkeys, find all recorded body weights
SQL
select
i.individual_id,
p.body_weight_kgs,
p.capture_timestamp,
p.processing_timestamp
from
individuals i
left join processings p on
p.individual_id = i.individual_id
where
i.individual_id in(3202, 3150, 3499, 3658, 3422)
and p.body_weight_kgs is not null;
R
query = dbSendQuery(con, 'SELECT * FROM public.processings')
processings = dbFetch(query)
query = dbSendQuery(con, 'SELECT * FROM public.individuals')
individuals = dbFetch(query)
filter_list = c(3202, 3150, 3499, 3658, 3422) # custom filter list
result <- individuals %>%
left_join(processings, by = "individual_id") %>%
filter(individual_id %in% filter_list &
!is.na(body_weight_kgs)) %>%
select(individual_id, body_weight_kgs, capture_timestamp, processing_timestamp)
5. Given a list of monkeys, find the total duration they were observed
This example fetches 1000 random individuals from the list
SQL
select
i.individual_id,
MIN(bd.behavior_data_time) as earliest_behavior_entry,
MAX(bd.behavior_data_time) as latest_behavior_entry,
(MAX(bd.behavior_data_time) - MIN(bd.behavior_data_time)) as total_days,
COUNT(bd.behavior_data_id) as total_behavior_counts
from
individuals i
inner join behavior_samples bs on
bs.individual_id1 = i.individual_id
inner join behavior_data bd on
bs.behavior_sample_id = bd.behavior_sample_id
where
individual_id in (
select
individual_id
from
individuals i
order by
RANDOM()
limit 1000
)
group by
i.individual_id;