Skip to content
Snippets Groups Projects

Similarity

Merged Jared Moore requested to merge similarity into master
+ 46
4
Compare changes
  • Side-by-side
  • Inline
+ 46
4
# Window length used when processing reads.
# NOTE(review): usage is not visible in this chunk — presumably the k-mer /
# hashing window size; confirm against parsing.jl and hash_genome.
const WINDOW_SIZE = 25
# Minimum fraction of per-position agreement (in [0, 1]) a merged consensus
# must reach before it is accepted by is_similar_enough.
# NOTE(review): chosen ad hoc ("made up value") — should be tuned on real data.
# Written as 0.60, not .60: leading-dot float literals were removed in Julia 1.0.
const MIN_SEQUENCE_SIMILARITY = 0.60
include("parsing.jl")
@@ -105,7 +107,7 @@ function sequence_voting(sequences, reads, low, high)
merged = MergedRead(Array(combined_nucleotide, end_index),
Array(Array{UInt16}, end_index))
for i in 1:end_index
# Using zeros in order to force initialization
# Using zeros in order to force initialization
merged.counts[i] = zeros(UInt16, length(NUCLEOTIDE_CHARS))
for hash_pair in sequences
# We use high here because it is the first hash to appear
@@ -131,13 +133,32 @@ function sequence_voting(sequences, reads, low, high)
return merged
end
# Decide whether a merged consensus is trustworthy.
#
# sequence_counts is a per-position collection of nucleotide vote-count
# vectors (MergedRead.counts).  For each position the votes for the most
# common nucleotide are accumulated; the returned verdict is whether the
# count-weighted average agreement — total winning votes divided by total
# votes — exceeds MIN_SEQUENCE_SIMILARITY.
#
# Mathematically identical to the original weighted mean
# sum((max/total_i) * total_i) / sum(total_i), but computed without the
# redundant divide-then-multiply round-trip, and without the dead
# `nuc_sim = 1.0` initialization that was immediately overwritten.
function is_similar_enough(sequence_counts)
    winning_votes = 0.0
    total_votes = 0.0
    for counts in sequence_counts
        winning_votes += counts[indmax(counts)]
        total_votes += sum(counts)
    end
    # Guard the 0/0 case (empty input, or every position all-zero counts).
    # The original produced NaN here — NaN * 0 is NaN, so one all-zero
    # position silently poisoned the whole sum.  An empty or vote-less
    # merge is never "similar enough".
    if total_votes == 0.0
        return false
    end
    return (winning_votes / total_votes) > MIN_SEQUENCE_SIMILARITY
end
# Accumulate `a` into `b` element-wise, in place: b[i] += a[i].
# Both arrays must have the same length; `a` is left unmodified.
# Throws DimensionMismatch when the lengths differ.
function combine_arrays(a, b)
    # Explicit validation instead of @assert: assertions are a debugging
    # aid and may be disabled at higher optimization levels, so they must
    # not guard input contracts.
    if length(a) != length(b)
        throw(DimensionMismatch(
            "combine_arrays: length(a) = $(length(a)) != length(b) = $(length(b))"))
    end
    for i in eachindex(a)
        b[i] += a[i]
    end
    return nothing
end
end
# places the read_number associated with read in bucket with the
# closest hash to hash in hash_to_reads
@@ -158,11 +179,33 @@ function merge_reads(hash_to_reads, cur_reads)
bucket = hash_to_reads[k]
low, high = find_bounding_reads(bucket, cur_reads)
merged_read = sequence_voting(bucket, cur_reads, low, high)
push!(newreads, merged_read)
if (is_similar_enough(merged_read.counts))
push!(newreads, merged_read)
else
# add all of the reads that were in a bucket because they were
# not significant enough to merge
for hash_pair in bucket
cur_read = cur_reads[hash_pair.read_index + 1]
mr = isa(cur_read, MergedRead) ? cur_read : to_merged_read(cur_read)
push!(newreads, mr)
end
end
end
return newreads
end
# Wrap a plain nucleotide read in a MergedRead so it can flow through the
# same pipeline as voted merges: the sequence is copied position by
# position, and each position's count vector records exactly one vote for
# that position's nucleotide.
function to_merged_read(cur_read::Array{combined_nucleotide})
    n = length(cur_read)
    merged = MergedRead(Array(combined_nucleotide, n),
                        Array(Array{UInt16}, n))
    for (pos, nuc) in enumerate(cur_read)
        votes = zeros(UInt16, length(NUCLEOTIDE_CHARS))
        # The + 1 turns the nucleotide code into a one-based array index.
        votes[combinedToNucleotide(nuc) + 1] += 1
        merged.counts[pos] = votes
        merged.sequence[pos] = nuc
    end
    return merged
end
function print_round_info(cur_reads, debug)
if debug
println(length(cur_reads))
@@ -192,4 +235,3 @@ function hash_genome(reads, read_length, debug)
print_round_info(cur_reads, debug)
end
end
Loading