Skip to content
Snippets Groups Projects

Similarity

Merged Jared Moore requested to merge similarity into master
+ 46
4
Compare changes
  • Side-by-side
  • Inline
+ 46
4
# Window length used when processing reads.
# NOTE(review): usage is not visible in this chunk — presumably the k-mer /
# hashing window size; confirm against parsing.jl and hash_genome.
const WINDOW_SIZE = 25
# Minimum fraction of per-position agreement (in [0, 1]) a merged consensus
# must reach before it is accepted by is_similar_enough.
# NOTE(review): chosen ad hoc ("made up value") — should be tuned on real data.
# Written as 0.60, not .60: leading-dot float literals were removed in Julia 1.0.
const MIN_SEQUENCE_SIMILARITY = 0.60
include("parsing.jl")
@@ -105,7 +107,7 @@ function sequence_voting(sequences, reads, low, high)
merged = MergedRead(Array(combined_nucleotide, end_index),
Array(Array{UInt16}, end_index))
for i in 1:end_index
# Using zeros in order to force initialization
# Using zeros in order to force initialization
merged.counts[i] = zeros(UInt16, length(NUCLEOTIDE_CHARS))
for hash_pair in sequences
# We use high here because it is the first hash to appear
@@ -131,13 +133,32 @@ function sequence_voting(sequences, reads, low, high)
return merged
end
# Decide whether a merged consensus is trustworthy.
#
# sequence_counts is a per-position collection of nucleotide vote-count
# vectors (MergedRead.counts).  For each position the votes for the most
# common nucleotide are accumulated; the returned verdict is whether the
# count-weighted average agreement — total winning votes divided by total
# votes — exceeds MIN_SEQUENCE_SIMILARITY.
#
# Mathematically identical to the original weighted mean
# sum((max/total_i) * total_i) / sum(total_i), but computed without the
# redundant divide-then-multiply round-trip, and without the dead
# `nuc_sim = 1.0` initialization that was immediately overwritten.
function is_similar_enough(sequence_counts)
    winning_votes = 0.0
    total_votes = 0.0
    for counts in sequence_counts
        winning_votes += counts[indmax(counts)]
        total_votes += sum(counts)
    end
    # Guard the 0/0 case (empty input, or every position all-zero counts).
    # The original produced NaN here — NaN * 0 is NaN, so one all-zero
    # position silently poisoned the whole sum.  An empty or vote-less
    # merge is never "similar enough".
    if total_votes == 0.0
        return false
    end
    return (winning_votes / total_votes) > MIN_SEQUENCE_SIMILARITY
end
# Accumulate `a` into `b` element-wise, in place: b[i] += a[i].
# Both arrays must have the same length; `a` is left unmodified.
# Throws DimensionMismatch when the lengths differ.
function combine_arrays(a, b)
    # Explicit validation instead of @assert: assertions are a debugging
    # aid and may be disabled at higher optimization levels, so they must
    # not guard input contracts.
    if length(a) != length(b)
        throw(DimensionMismatch(
            "combine_arrays: length(a) = $(length(a)) != length(b) = $(length(b))"))
    end
    for i in eachindex(a)
        b[i] += a[i]
    end
    return nothing
end
end
# places the read_number associated with read in bucket with the
# closest hash to hash in hash_to_reads
@@ -158,11 +179,33 @@ function merge_reads(hash_to_reads, cur_reads)
bucket = hash_to_reads[k]
low, high = find_bounding_reads(bucket, cur_reads)
merged_read = sequence_voting(bucket, cur_reads, low, high)
push!(newreads, merged_read)
if (is_similar_enough(merged_read.counts))
push!(newreads, merged_read)
else
# add all of the reads that were in a bucket because they were
# not significant enough to merge
for hash_pair in bucket
cur_read = cur_reads[hash_pair.read_index + 1]
mr = isa(cur_read, MergedRead) ? cur_read : to_merged_read(cur_read)
push!(newreads, mr)
end
end
end
return newreads
end
# Wrap a plain nucleotide read in a MergedRead so it can flow through the
# same pipeline as voted merges: the sequence is copied position by
# position, and each position's count vector records exactly one vote for
# that position's nucleotide.
function to_merged_read(cur_read::Array{combined_nucleotide})
    n = length(cur_read)
    merged = MergedRead(Array(combined_nucleotide, n),
                        Array(Array{UInt16}, n))
    for (pos, nuc) in enumerate(cur_read)
        votes = zeros(UInt16, length(NUCLEOTIDE_CHARS))
        # The + 1 turns the nucleotide code into a one-based array index.
        votes[combinedToNucleotide(nuc) + 1] += 1
        merged.counts[pos] = votes
        merged.sequence[pos] = nuc
    end
    return merged
end
function print_round_info(cur_reads, debug)
if debug
println(length(cur_reads))
@@ -192,4 +235,3 @@ function hash_genome(reads, read_length, debug)
print_round_info(cur_reads, debug)
end
end
Loading