from IPython.core.display import HTML

def _set_css_style(css_file_path):
    """
    Read a custom CSS file and wrap it for display in Jupyter.

    Parameters
    ----------
    css_file_path : str or path-like
        Path to the CSS file to load.

    Returns
    -------
    IPython.core.display.HTML
        An HTML object holding the file's contents inside a ``<style>``
        element. The styles are applied when Jupyter displays the object,
        e.g. when the call is the last expression of a cell.

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    # A context manager closes the file handle deterministically instead of
    # leaving it open until the garbage collector gets to it.
    with open(css_file_path, "r") as css_file:
        styles = css_file.read()
    return HTML(f'<style>{styles}</style>')

# Load 'rise.css' (a relative path, so it is resolved against the current
# working directory). The call returns an HTML object; as the last expression
# of a notebook cell that object is displayed, which is what applies the CSS.
_set_css_style('rise.css')

from Bio.Seq import Seq # the submodule structure of biopython is a little awkward

# Build a sequence object from a plain string. A bare name on its own line is
# a notebook cell's display expression: it shows the object.
s = Seq('GATTACA')
s

# Index a single position.
s[0]

s[2:4] # returns sequence

# Lower-case version of the sequence.
s.lower()

# Concatenate two sequences with +.
s + s

dna = Seq('GATTACAGATTACAGATTACA')
# The comma makes this a 2-tuple: (complement, reverse complement).
dna.complement(),dna.reverse_complement()

# Show dna itself, for comparison with the two results above.
dna

# DNA -> RNA
rna = dna.transcribe()
rna

# RNA -> protein
protein = rna.translate()
protein

dna.translate() # unlike cells, don't actually need rna

from Bio.Data import CodonTable
# Names of all the unambiguous-DNA codon tables, in alphabetical order.
# Sorting the dict directly iterates over its keys.
print(sorted(CodonTable.unambiguous_dna_by_name))

# The table registered under the name 'Standard'.
print(CodonTable.unambiguous_dna_by_name['Standard'])

from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
# SeqIO.read is for files that hold exactly one record (here a GenBank file).
seq = SeqIO.read('../files/p53.gb', 'genbank')
seq

# https://MSCBIO2025-2024.github.io/files/hydra.fasta
# SeqIO.parse yields the records of a multi-record file one at a time; list()
# collects them all. Building the list directly (rather than appending in a
# `for s in ...` loop) also avoids overwriting the Seq bound to `s` earlier.
seqs = list(SeqIO.parse('../files/hydra.fasta', 'fasta'))

len(seqs)

from Bio import Entrez
# NCBI asks for a contact address with every request; Biopython warns when
# Entrez.email is not set.
Entrez.email = 'jpbarton@pitt.edu'

# Each Entrez call returns an open, file-like handle over the raw XML reply.
# Parse it with Entrez.read, then close it so the connection is released.
result = Entrez.einfo()
res = Entrez.read(result)  # the names of all available databases
result.close()
res

print(sorted(res['DbList']))

result = Entrez.esearch(db='nucleotide', term='tp53')  # the result is a file-like object of the raw xml data
records = Entrez.read(result) # put into a more palatable form (dictionary)
result.close()
print(records)

# Same search again, this time asking for up to 50 ids via retmax.
result = Entrez.esearch(db='nucleotide', term='tp53', retmax=50)
records = Entrez.read(result)
result.close()
records

#fetch the genbank file for the first id from our search
# (rettype="gb" with retmode='text' asks for the record as GenBank text)
result = Entrez.efetch(db="nucleotide",id=records['IdList'][0],rettype="gb",retmode='text')
#parse into a seqrecord
p53 = SeqIO.read(result,'gb')

# Shows the handle object itself; its contents were already read by SeqIO.read.
result

p53

# The annotated features attached to the record.
p53.features

# NOTE(review): index 3 is presumably the CDS feature of whichever record was
# first in the search results when this was written. The first result can
# change over time, so confirm the printed feature really is the CDS.
cdsfeature = p53.features[3]
print(cdsfeature)

coding = cdsfeature.extract(p53) #pass the full record (p53) to the feature
coding

from Bio.Blast import NCBIWWW
# Submit the extracted coding sequence to NCBI's online BLAST service:
# program 'blastn', database 'nt', requesting up to 250 hits.
result = NCBIWWW.qblast('blastn','nt',coding.seq,hitlist_size=250)
# result is a file-like object with xml in it - it can take a while to get results

from Bio.Blast import NCBIXML #for parsing xmls
# Parse the XML into a record object (a single query was submitted above).
blast_records = NCBIXML.read(result)

# How many alignments and how many descriptions came back.
print(len(blast_records.alignments),len(blast_records.descriptions))

def _print_alignment_summary(alignment, width=75):
    """
    Print a short report for the first HSP of one BLAST alignment.

    Prints the alignment's title and length, the HSP's e-value, and the first
    ``width`` characters of the query, match and subject lines, each followed
    by '...'.

    Parameters
    ----------
    alignment
        An entry of ``blast_records.alignments``; must have at least one HSP.
    width : int, optional
        Number of leading characters of each alignment line to print.

    Returns
    -------
    The HSP that was reported (``alignment.hsps[0]``).
    """
    hsp = alignment.hsps[0] # high scoring segment pairs
    print('****Alignment****')
    print('sequence:', alignment.title)
    print('length:', alignment.length)
    print('e value:', hsp.expect)
    print(hsp.query[0:width] + '...')  # what we searched with
    print(hsp.match[0:width] + '...')
    print(hsp.sbjct[0:width] + '...')  # what we matched to
    return hsp


alignment = blast_records.alignments[0]
print(len(alignment.hsps))

# Report the first alignment; `hsp` stays bound at module level as before.
hsp = _print_alignment_summary(alignment)

alignment = blast_records.alignments[-1] # get last alignment
hsp = _print_alignment_summary(alignment)

from Bio import AlignIO
# Read one multiple-sequence alignment from a Clustal-format file.
align = AlignIO.read('../files/hydra179.aln','clustal')

align

# print() shows the text rendering of the alignment.
print(align)

align[0] # first row

align[:,0] # first column

# All rows, columns 0 through 9.
print(align[:,0:10])

# Sequence analysis and biopython

# Sequence data

# FASTA

# Genbank

# Biopython

# Sequence Objects

# Accessing Seq data

# The Central Dogma

# The Central Dogma

# Codon Tables

# SeqRecord

# Fetching sequences from the Internet

# ESearch

# EFetch

# Features

# Extracting subsequences

# BLAST

# Alignments

# Slicing Alignments

# And now for a brief foray into marine microbiology...

# Project

# For next time