In [1]:
from datetime import datetime

import pandas as pd
import plotly.express as px
import yaml

In [2]:
# Default parameter values. The "# Parameters" cell that follows reassigns
# every one of these names, so these only apply when that cell is absent.
wkdir = '../..'
config_path = '../../config/config.yaml'
metadata_path = '../../config/metadata.tsv'
bed_targets_path = '../../config/ag-vampir.bed'

dataset = "my_dataset_name"
panel = 'ag-vampir'
cohort_cols = 'taxon,location'

# Turn off pandas' row / column / cell-width truncation so displayed tables
# are rendered in full. set_option accepts several (option, value) pairs.
pd.set_option(
    "display.max_rows", None,
    "display.max_columns", None,
    "max_colwidth", None,
)

In [3]:
# Parameters
# Run-specific values; each assignment replaces the default of the same name
# set in the previous cell.
# NOTE(review): presumably injected by the workflow runner - confirm before
# editing by hand.

# Project location and configuration files.
wkdir = "/home/snagi/lstm_projects/ampseq-agvampir002"
config_path = "/home/snagi/lstm_projects/ampseq-agvampir002/config/config_ms.yaml"
metadata_path = "config/metadata_ms.tsv"
bed_targets_path = "config/ag-vampir.bed"

# Run identifiers and cohort grouping columns (comma-separated string).
dataset = "ag-vampir-002"
panel = "ag-vampir"
cohort_cols = "location,taxon"


# Run information

In [4]:
# Timestamp of this execution, formatted as "YYYY-MM-DD HH:MM:SS" in local time.
# A datetime format spec inside an f-string is handed to strftime.
time = f"{datetime.today():%Y-%m-%d %H:%M:%S}"

# Summarise which dataset / panel / cohort columns this run was executed with.
print(f"Dataset: {dataset}\nPanel: {panel}\nCohort columns: {cohort_cols}\n\nExecution time: {time}")

Dataset: ag-vampir-002
Panel: ag-vampir
Cohort columns: location,taxon

Execution time: 2024-12-03 22:44:50


#### Reference genome

In [5]:
# Display the reference-genome settings stored in the YAML file at config_path.
with open(config_path, 'r') as file:
    # An empty YAML document parses to None; fall back to an empty mapping so
    # the filtering below does not fail on `.items()`.
    config_yml = yaml.safe_load(file) or {}

# Keep only the entries whose key mentions 'reference'. YAML keys are not
# guaranteed to be strings, so non-string keys are skipped rather than
# raising TypeError on the substring test.
ref_dict = {
    k: v
    for k, v in config_yml.items()
    if isinstance(k, str) and 'reference' in k
}

# One (key, value) row per setting. Building from pairs with explicit column
# names always yields exactly two columns, including when nothing matched or
# when a value is itself a list.
ref_df = pd.DataFrame(list(ref_dict.items()), columns=['reference', 'value'])
ref_df

Unnamed: 0,reference,value
0,reference-name,AgamP4
1,reference-fasta,resources/reference/Anopheles-gambiae-PEST_CHROMOSOMES_AgamP4.fa
2,reference-gff3,resources/reference/Anopheles-gambiae-PEST_BASEFEATURES_AgamP4.12.gff3
3,reference-snpeffdb,Anopheles_gambiae


#### Panel information

In [6]:
# Read the headerless, tab-separated panel target file and label its columns.
# set_axis raises ValueError if the file does not have exactly these seven columns.
panel_metadata = pd.read_csv(bed_targets_path, sep="\t", header=None).set_axis(
    ['contig', 'start', 'end', 'amplicon', 'mutation', 'ref', 'alt'],
    axis=1,
)

# Distinct contig names present in the panel, in order of first appearance.
contigs = panel_metadata['contig'].unique()
panel_metadata

Unnamed: 0,contig,start,end,amplicon,mutation,ref,alt
0,2L,209535,209536,Agam_1,AIM1,A,G
1,2L,927246,927247,Agam_2,AIM2,C,A
2,2L,1274352,1274353,Agam_3,AIM3,G,A
3,2L,1418209,1418210,Agam_4,AIM4,T,C
4,2L,1571928,1571929,Agam_5,AIM5,T,C
5,2L,1776347,1776348,Agam_6,AIM6,G,C
6,2L,1947573,1947574,Agam_7,AIM7,G,A
7,2L,2380981,2380982,Agam_8,Vgsc_tag1,T,C
8,2L,2381705,2381706,Agam_9,Vgsc_tag2,A,G
9,2L,2388594,2388595,Agam_10,Vgsc_tag3,G,T


#### Input metadata

In [7]:
# Load the sample metadata table. The reader is chosen from the file
# extension, matched case-insensitively so that e.g. ".TSV" is accepted too.
if metadata_path.lower().endswith('.xlsx'):
    metadata = pd.read_excel(metadata_path, engine='openpyxl')
elif metadata_path.lower().endswith('.tsv'):
    metadata = pd.read_csv(metadata_path, sep="\t")
elif metadata_path.lower().endswith('.csv'):
    metadata = pd.read_csv(metadata_path, sep=",")
else:
    # Name every supported extension (the .tsv branch above was previously
    # missing from this message) and echo the offending path.
    raise ValueError(
        f"Metadata file must be .xlsx, .tsv or .csv (got {metadata_path!r})"
    )

metadata

Unnamed: 0,sample_id,taxon,location,country,plate,well_letter,well_number,latitude,longitude
0,GH_01,,Obuasi,Ghana,3,A,1,,
1,GH_02,,Obuasi,Ghana,3,A,2,,
2,GH_03,,Obuasi,Ghana,3,A,3,,
3,GH_04,,Obuasi,Ghana,3,A,4,,
4,GH_05,,Obuasi,Ghana,3,A,5,,
5,GH_06,,Obuasi,Ghana,3,A,6,,
6,GH_07,,Obuasi,Ghana,3,A,7,,
7,GH_08,,Obuasi,Ghana,3,A,8,,
8,GH_09,,Obuasi,Ghana,3,A,9,,
9,GH_10,,Obuasi,Ghana,3,A,10,,
