######################################
# Connect to server
######################################
# Windows users: download PuTTY from the internet
# PuTTY configuration: host IP address 129.24.144.224, password: test

# Mac users: open terminal
	> ssh sbuddenb@129.24.144.224
		pw: test

######################################
# File viewing and manipulation
######################################
# Look at the beginning of the file to see what it looks like
	> more 454_testreads.fa

# Search for a specific read ID and print the matching line plus the 30 lines after it
	> grep HJ4YRIA01CSMNU -A 30 454_testreads.fa

# How could you remove the header?
	> sed 's/HJ4YRIA.*//' 454_testreads.fa > 454_testreads_seqonly

# How could you remove parts of the header?
	> sed 's/rank.*$//g' 454_testreads.fa > 454_testreads_shortheader

# How could you change parts of the header? (Note: perl -pi edits the matching files in place)
	> perl -pi -w -e 's/SEARCH_FOR/REPLACE_WITH/g;' *.txt
	ex/ > perl -pi -e 's/HJ4YRIA/SarahisSweet/g;' *.txt

# How many individual reads are there?
	> grep -c '^>' 454_testreads.fa

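# What is the average sequence length? The command below prints the total character count of all sequence lines (newlines included); divide it by the number of reads counted above to get the average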
	> grep -v '^>' 454_testreads.fa | wc --chars


######################################
# SEQCLEAN: filter raw reads 
######################################
# Seqclean uses Megablast so we need to make a blastable database of our genome on the server

# Go to location of BLAST program
	> cd /teraraid23/sarah/blast

# Using blast+
	> makeblastdb -in testjunk.fa -input_type fasta -dbtype nucl -out testjunk.fa
	
# Run seqclean
	> ./seqclean 454_testreads.fa -c 1 -s /teraraid23/sarah/blast/testjunk.fa

# Rename output file
	> mv 454_testreads.fa.clean 454_testreads_filtered.fa


######################################
# GMAP: transcripts x genes/genomes
######################################
# Build database for Capsaspora genome
	> ./gmap_build -d /teraraid23/sarah/bin/CapsasporaGenome_short.fa /teraraid23/sarah/bin/CapsasporaGenome_short.fa

# Align 454_testreads_filtered.fa x CapsasporaGenome_short.fa, then look at the resulting file
	> ./gmap -D /teraraid23/sarah/bin/ -d CapsasporaGenome_short.fa /teraraid23/sarah/seqclean/seqclean-x86_64/454_testreads_filtered.fa --npaths=1 -Z > 454_testreads_filtered_xCapsasporaGenome 2>stderr.txt

# GMAP gives you all matches, good and bad, so we need to choose a cutoff based on %identity and %coverage
	> perl perl_forZ_gmap.pl 454_testreads_filtered_xCapsasporaGenome 95 95


######################################
# BLASTn: transcripts x transcripts
######################################
# Make blastable database of snail_ESTs.fa
	> makeblastdb -in snail_ESTs.fa -input_type fasta -dbtype nucl -parse_seqids -out snail_ESTs.fa

# Align 454_testreads_filtered.fa x snail_ESTs.fa (snaildb, genbank ESTs, vectorbase)
	> blastn -db /teraraid23/sarah/blast/snail_ESTs.fa -query /teraraid23/sarah/seqclean/seqclean-x86_64/454_testreads_filtered.fa -task blastn -outfmt 6 -evalue 1e-50 -max_target_seqs 1 -num_threads 16 -out 454_filterd_xsnailESTs_e-50



# Galaxy public server (web-based analysis tools): https://main.g2.bx.psu.edu/