From 5cfb74fae40e55a7315a3ca6665676faccb3890b Mon Sep 17 00:00:00 2001
From: Kevin Bonham <kevin@bonham.ch>
Date: Tue, 4 Mar 2025 12:10:09 -0500
Subject: [PATCH 1/6] add fib problem

---
 docs/src/rosalind/04-fib.md                   | 93 +++++++++++++++++++
 .../rosalind/problem_inputs/rosalind_fib.txt  |  1 +
 2 files changed, 94 insertions(+)
 create mode 100644 docs/src/rosalind/04-fib.md
 create mode 100644 docs/src/rosalind/problem_inputs/rosalind_fib.txt

diff --git a/docs/src/rosalind/04-fib.md b/docs/src/rosalind/04-fib.md
new file mode 100644
index 0000000..d079351
--- /dev/null
+++ b/docs/src/rosalind/04-fib.md
@@ -0,0 +1,93 @@
+# ♻️ 🐇 Rabbits and Recurrence Relations
+
+[Original Problem](https://rosalind.info/problems/fib/)
+
+!!! warning "The Problem"
+    _Given_: Positive integers ``n≤40``
+    and $k≤5$.
+
+    _Return_: The total number of rabbit pairs
+    that will be present after ``n`` months,
+    if we begin with 1 pair and in each generation,
+    every pair of reproduction-age rabbits produces a litter of ``k``
+    rabbit pairs (instead of only 1 pair).
+
+    Sample Dataset
+    ```txt
+    5 3
+    ```
+    Sample Output
+    ```
+    19
+    ```
+
+## Recursion in julia
+
+This is a classic computer science problem,
+and performing recursion in julia is pretty easy -
+we simply have a function call itself.
+
+The key is to have a bail-out condition
+to avoid infinite recursion.
+
+In this case, the bail-out is when you reach ``1``,
+and it's also helpful to avoid invalid inputs.
+
+```@example fib
+function fib(n::Int, k::Int)
+    # validate inputs
+    n >= 0 || error("N must be greater than or equal to 0")
+    k > 1 || error("K must be at least 2")
+
+    # once we reach 1 or 0, we just return 1 or 0
+    if n <= 1
+        return n
+    else
+        # otherwise, recursively call on the previous 2 integers
+        return fib(n - 1, k) + k * fib(n - 2, k)
+    end
+end
+```
+
+Let's go through each piece:
+
+## Multiple dispatch
+
+```julia
+function fib(n::Int, k::Int)
+```
+
+Here, we're defining a function with 2 arguments,
+``n`` and ``k``.
+The `::Int` syntax forces the arguments to have the type `Int`,
+which is short-hand for a 64-bit integer.
+
+As we saw in previous problems,
+julia makes use of "multiple dispatch",
+which means we can define the same function with multiple "methods",
+each of which takes different types of arguments.
+
+In this case,
+if we try to call the function with different arguments,
+we'll get a `MethodError`, telling us that there isn't
+a version that works on that combination of types:
+
+
+```julia-repl
+julia> fib("5", "3")
+ERROR: MethodError: no method matching fib(::String, ::String)
+The function `fib` exists, but no method is defined for this combination of argument types.
+
+Closest candidates are:
+  fib(::Int64, ::Int64)
+```
+
+If we'd like, we could define a version
+that takes `String`s as arguments,
+and tries to convert them into integers.
+To do that, we use the `parse()` function.
+
+```@example fib
+parse(Int, "5")
+```
+
diff --git a/docs/src/rosalind/problem_inputs/rosalind_fib.txt b/docs/src/rosalind/problem_inputs/rosalind_fib.txt
new file mode 100644
index 0000000..92d318a
--- /dev/null
+++ b/docs/src/rosalind/problem_inputs/rosalind_fib.txt
@@ -0,0 +1 @@
+31 3

From ab25b93e3b6e15048ec80e8223e2ac52ebabcf35 Mon Sep 17 00:00:00 2001
From: Kevin Bonham <kevin@bonham.ch>
Date: Mon, 10 Mar 2025 14:19:28 -0400
Subject: [PATCH 2/6] finish up fibonacci

---
 docs/src/rosalind/01-dna.md |   2 +-
 docs/src/rosalind/04-fib.md | 208 ++++++++++++++++++++++++++++++++++--
 2 files changed, 198 insertions(+), 12 deletions(-)

diff --git a/docs/src/rosalind/01-dna.md b/docs/src/rosalind/01-dna.md
index 712bf9e..b47ac5a 100644
--- a/docs/src/rosalind/01-dna.md
+++ b/docs/src/rosalind/01-dna.md
@@ -1,5 +1,5 @@
 
-##  🧬 Problem 1: Counting DNA nucleotides
+#  🧬 Problem 1: Counting DNA nucleotides
 
 🤔 [Problem link](https://rosalind.info/problems/dna/)
 
diff --git a/docs/src/rosalind/04-fib.md b/docs/src/rosalind/04-fib.md
index d079351..66039e0 100644
--- a/docs/src/rosalind/04-fib.md
+++ b/docs/src/rosalind/04-fib.md
@@ -21,13 +21,15 @@
     19
     ```
 
+This is a classic computer science problem,
+but not _directly_ biology focused.
+Instead, we'll use it to showcase some other julia features.
+
 ## Recursion in julia
 
-This is a classic computer science problem,
-and performing recursion in julia is pretty easy -
+Recursion in julia is pretty easy -
 we simply have a function call itself.
-
-The key is to have a bail-out condition
+As in any language, the key is to have a bail-out condition
 to avoid infinite recursion.
 
 In this case, the bail-out is when you reach ``1``,
@@ -51,8 +53,67 @@ end
 
 Let's go through each piece:
 
+```julia
+function fib(n::Int, k::Int)
+```
+
+is the function definition.
+It takes two arguments, `n` and `k`, both of type `Int`.
+
+```julia
+n >= 0 || error("N must be greater than or equal to 0")
+k > 1 || error("K must be at least 2")
+```
+
+are what we call "short-circuit" evaluation.
+The `||` operator is a logical OR operator,
+and so if the first condition is true, the second condition is not evaluated
+(because `true` OR anything is true).
+The same things could have been written with the short-circuiting `&&`
+or as an `if` statement:
+
+```julia
+!(n >= 0) && error("N must be greater than or equal to 0")
+# or
+n < 0 && error("N must be greater than or equal to 0")
+# or
+if n < 0
+    error("N must be greater than or equal to 0")
+end
+```
+
+```julia
+if n <= 1
+    return n
+```
+
+This is our recursion bail-out or "base case".
+Once we reach ``1`` or ``0``, we don't want to recurse any more,
+we just return that value.
+
+```julia
+else
+    return fib(n - 1, k) + k * fib(n - 2, k)
+end
+```
+
+If we're not at the base case, we recurse.
+We call the function again, but with ``n - 1``
+for the previous generation,
+and ``n - 2`` for the generation before that.
+We also multiply the second call by ``k``
+to handle the fact that each pair of rabbits
+from two generations ago (the ones that have matured)
+produce ``k`` pairs of rabbits when they breed.
+
 ## Multiple dispatch
 
+So that solves the recursion problem,
+but one thing you might notice
+if you try to read the inputs from rosalind.info directly
+is that we read the files as `String`s,
+but we've defined our function to operate on `Int`s.
+
 ```julia
 function fib(n::Int, k::Int)
 ```
@@ -62,17 +123,35 @@ Here, we're defining a function with 2 arguments,
 The `::Int` syntax forces the arguments to have the type `Int`,
 which is short-hand for a 64-bit integer.
 
-As we saw in previous problems,
-julia makes use of "multiple dispatch",
-which means we can define the same function with multiple "methods",
-each of which takes different types of arguments.
+In something like python,
+you would probably write the function without type annotations,
+and then check inside the function
+with an `if` statement to deal with
+the case where the arguments are not integers:
+
+```python
+def fib(n, k):
+    if not isinstance(n, int):
+        n = int(n)
+    if not isinstance(k, str):
+        k = int(k)
+
+    # do stuff
+```
+
+In julia, we can handle this instead by
+defining different "methods" of `fib`, each of which
+takes different types of arguments.
+In a previous problem,
+when we defined a function to operate on `String`s or on `BioSequence`s,
+we saw that julia makes use of "multiple dispatch",
+though we didn't name it as such.
 
 In this case,
-if we try to call the function with different arguments,
-we'll get a `MethodError`, telling us that there isn't
+if we try to call the function with different argument types,
+say `String`s, we'll get a `MethodError`, telling us that there isn't
 a version that works on that combination of types:
 
-
 ```julia-repl
 julia> fib("5", "3")
 ERROR: MethodError: no method matching fib(::String, ::String)
@@ -86,8 +165,115 @@ If we'd like, we could define a version
 that takes `String`s as arguments,
 and tries to convert them into integers.
 To do that, we use the `parse()` function.
+For example:
 
 ```@example fib
 parse(Int, "5")
 ```
 
+So, here's a version of `fib` that takes `String`s as arguments:
+
+```@example fib
+function fib(n::String, k::String)
+    nint = parse(Int, n)
+    kint = parse(Int, k)
+    fib(nint, kint)
+end
+
+fib("5", "3")
+```
+
+Ok, but this still doesn't work:
+
+```julia-repl
+julia> fib("2", 3)
+ERROR: MethodError: no method matching fib(::String, ::Int64)
+The function `fib` exists, but no method is defined for this combination of argument types.
+
+Closest candidates are:
+  fib(::String, ::String)
+   @ Main REPL[2]:1
+  fib(::Int64, ::Int64)
+   @ Main REPL[1]:1
+```
+
+We could now go through and define methods
+of `fib` that take different types of arguments.
+For example:
+
+```@example fib
+function fib(n::String, k::Int)
+    nint = parse(Int, n)
+    fib(nint, k)
+end
+```
+
+or
+
+```@example fib
+function fib(n::Int, k::String)
+    kint = parse(Int, k)
+    fib(n, kint)
+end
+```
+
+Now, we can call `fib` with an `Int` and a `String`:
+
+```@example fib
+fib(2, "3")
+```
+
+## Parsing files
+
+When you do a problem on Rosalind,
+the file you download contains your problem input.
+Up to now,
+I've just assumed you'll be copy-pasting
+the input into your Julia REPL.
+
+However, it's often more convenient to read the input from a file,
+and in this case, a little additional parsing is required,
+since the input comes in the form of two integers separated by a space,
+and when you read in the file, it will be a `String`.
+
+There are a number of ways to read files in julia,
+including the `readlines()` function,
+which loads all of the lines of the file into `Vector{String}`.
+
+When you have really large files, this can be annoying,
+and you can instead use the `eachline()` function,
+so you can do something like `for line in readlines("input.txt")`
+and deal with each line separately.
+
+But in this case (and the previous rosalind.info problems we've looked at),
+each problem comes in as a single line,
+so we'll use the `read()` function instead.
+By default, `read()` reads the entire file into a vector of bytes,
+that is a `Vector{UInt8}`.
+This can be converted to a `String` using the `String()` function,
+but this is a common enough use-case that we can just tell `read()` to return a `String`
+by passing the `String` type as an argument.
+
+```julia
+read("rosalind_fib.txt", String)
+```
+
+One tricky gotcha is that the rosalind text files
+have a trailing newline character at the end of the file.
+This can be removed using the `strip()` function.
+
+Finally, because of the format of this input in particular,
+we'll use the `split()` function to split the string into two parts,
+and then convert each part to an integer using the `parse()` function.
+
+```@example fib
+function read_fib(file)
+    numbers = read(file, String)
+    numbers = strip(numbers) # remove trailing newline
+    n_str, k_str = split(numbers, ' ') # split on spaces
+    return parse(Int, n_str), parse(Int, k_str)
+end
+
+(n,k) = read_fib("problem_inputs/rosalind_fib.txt")
+fib(n, k)
+```

From f3ab3271bea1ab7b379fd8ee2010d82089d2db6c Mon Sep 17 00:00:00 2001
From: Kevin Bonham <kevin@bonham.ch>
Date: Mon, 10 Mar 2025 14:19:44 -0400
Subject: [PATCH 3/6] get started on GC content

---
 docs/src/rosalind/05-gc.md | 123 +++++++++++++++++++++++++++++++++++++
 1 file changed, 123 insertions(+)
 create mode 100644 docs/src/rosalind/05-gc.md

diff --git a/docs/src/rosalind/05-gc.md b/docs/src/rosalind/05-gc.md
new file mode 100644
index 0000000..89ac7fc
--- /dev/null
+++ b/docs/src/rosalind/05-gc.md
@@ -0,0 +1,123 @@
+# 🧮 Computing GC content
+
+!!! warning "The Problem"
+    The GC-content of a DNA string is given by the percentage of symbols
+    in the string that are 'C' or 'G'.
+    For example, the GC-content of "AGCTATAG" is 37.5%.
+    Note that the reverse complement of any DNA string has the same GC-content.
+
+    DNA strings must be labeled when they are consolidated into a database.
+    A commonly used method of string labeling is called FASTA format.
+    In this format, the string is introduced by a line that begins with '>',
+    followed by some labeling information.
+    Subsequent lines contain the string itself;
+    the first line to begin with '>' indicates the label of the next string.
+
+    In Rosalind's implementation,
+    a string in FASTA format will be labeled by the ID "Rosalind_xxxx",
+    where "xxxx" denotes a four-digit code between 0000 and 9999.
+
+    _Given_: At most 10 DNA strings in FASTA format (of length at most 1 kbp each).
+
+    _Return_: The ID of the string having the highest GC-content, followed by the GC-content of that string.
+    Rosalind allows for a default error of 0.001 in all decimal answers unless otherwise stated;
+    please see the note on absolute error below.
+
+    **Sample Dataset**
+    ```
+    >Rosalind_6404
+    CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
+    TCCCACTAATAATTCTGAGG
+    >Rosalind_5959
+    CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
+    ATATCCATTTGTCAGCAGACACGC
+    >Rosalind_0808
+    CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC
+    TGGGAACCTGCGGGCAGTAGGTGGAAT
+    ```
+
+    **Sample Output**
+    ```
+    Rosalind_0808
+    60.919540
+    ```
+
+There are are really 3 parts of this problem.
+
+1. Parse the input, which is in a common biological format, FASTA
+2. Calculate the GC content
+3. Iterate through the input, keeping track of the largest GC content.
+
+Let's start with part 2, since it's quite similar to something we solved
+[in the very first problem](@ref "🧬 Problem 1: Counting DNA nucleotides").
+
+## Calculating GC content
+
+Just as when we were counting the frequency of each base,
+here, we can calculate the GC content
+by simply counting the number of G's or C's
+and dividing that number by the total length of the sequence.
+
+As a reminder, the `x-> do stuff` notation is an "anonymous function",
+which we're using as a "predicate" for the `count()` function.
+That is, `count()` will return the sum of elements where the predicate
+returns `true`.
+
+```@example gc
+my_seq = "AACCGGTTCT"
+
+function gc_content(seq)
+    gcs = count(base-> base in ('G', 'C'), seq)
+    return gcs / length(seq)
+end
+
+gc_content(my_seq)
+```
+
+A few things to note about this -
+There's no validation of the input,
+so this function will happily count the capital G's and C's
+of any string:
+
+```@example gc
+gc_content("Goodbye, Cruel World!")
+```
+
+But for now, this will do.
+
+### BioSequences method
+
+As with many of these problems,
+there is built-in functionality in the `BioSequences.jl` package.
+But before we get there,
+let's take a look at another problem with being permissive
+in our type signature up above:
+
+```@example gc
+bioseq = LongDNA{2}(my_seq)
+gc_content(bioseq)
+```
+
+What's going on here?
+The definition above tries to count the `Char`s 'G' and 'C',
+but when we iterate a `LongDNA`, we get back nucleotide types.
+So none of them match, providing a count of ``0``.
+We could modify the original function to look also for `DNA_C`,
+`DNA_G`, `RNA_C`, and `RNA_G`,
+or we can use the built-in predicate function `isGC()` from BioSequences,
+which returns `true` if it encounters any G or C nucleotide.
+
+```@example
+function gc_content(seq::LongNuc) # this type matches LongDNA or LongRNA
+    gcs = count(isGC, bioseq)
+    return gcs / length(seq)
+end
+
+gc_content(bioseq)
+```
+
+## Parsing the file
+
+We need to get this part done next.
+We started talking about dealing with files in the [last problem](@ref "♻️ 🐇 Rabbits and Recurrence Relations"),
+but let's go into a bit more detail.

From 4ca483d56b66fead9f37c1c6678c5b73d0d92369 Mon Sep 17 00:00:00 2001
From: Kevin Bonham <kevin@bonham.ch>
Date: Mon, 10 Mar 2025 16:04:13 -0400
Subject: [PATCH 4/6] finish up gc

---
 docs/src/rosalind/05-gc.md | 221 ++++++++++++++++++++++++++++++++++++-
 1 file changed, 220 insertions(+), 1 deletion(-)

diff --git a/docs/src/rosalind/05-gc.md b/docs/src/rosalind/05-gc.md
index 89ac7fc..ee9d317 100644
--- a/docs/src/rosalind/05-gc.md
+++ b/docs/src/rosalind/05-gc.md
@@ -94,6 +94,8 @@ let's take a look at another problem with being permissive
 in our type signature up above:
 
 ```@example gc
+using BioSequences
+
 bioseq = LongDNA{2}(my_seq)
 gc_content(bioseq)
 ```
@@ -107,7 +109,7 @@ We could modify the original function to look also for `DNA_C`,
 or we can use the built-in predicate function `isGC()` from BioSequences,
 which returns `true` if it encounters any G or C nucleotide.
 
-```@example
+```@example gc
 function gc_content(seq::LongNuc) # this type matches LongDNA or LongRNA
     gcs = count(isGC, bioseq)
     return gcs / length(seq)
@@ -121,3 +123,220 @@ gc_content(bioseq)
 We need to get this part done next.
 We started talking about dealing with files in the [last problem](@ref "♻️ 🐇 Rabbits and Recurrence Relations"),
 but let's go into a bit more detail.
+
+When you want speed, there are a lot of tricks
+to directly parse the bytes of the file one-by-one.
+In fact, BioJulia developers created the package `Automa.jl`
+to create fast parsers, and Jakob Nissen,
+one of the co-administrators of BioJulia,
+is currently working on modifications to Julia Base
+to make dealing with files and byte streams even faster.
+
+But for now, let's just do the easy thing,
+and parse the file line-by-line using the `eachline()` function.
+This function returns an iterator over the lines of the file,
+one at a time,
+that we can use in a `for` or `while` loop.
+
+To begin with, let's write a function
+that just goes through the whole file,
+putting each sequence record into a `Vector`.
+
+
+```@example gc
+function parse_fasta(file)
+    records = [] # this is a Vector of type `Any`
+    record_name = ""
+    sequence = ""
+    for line in eachline(file)
+        if startswith(line, ">")
+            !isempty(record_name) && push!(records, (record_name, sequence))
+            record_name = lstrip(line, '>')
+            sequence = ""
+        else
+            sequence *= line
+        end
+    end
+    push!(records, (record_name, sequence))
+    return records
+end
+
+fake_file = IOBuffer("""
+    >Rosalind_6404
+    CCTGCGGAAGATCGGCACTAGAATAGCCAGAACCGTTTCTCTGAGGCTTCCGGCCTTCCC
+    TCCCACTAATAATTCTGAGG
+    >Rosalind_5959
+    CCATCGGTAGCGCATCCTTAGTCCAATTAAGTCCCTATCCAGGCGCTCCGCCGAAGGTCT
+    ATATCCATTTGTCAGCAGACACGC
+    >Rosalind_0808
+    CCACCCTCGTGGTATGGCTAGGCATTCAGGAACCGGAGAACGCTTCAGACCAGCCCGGAC
+    TGGGAACCTGCGGGCAGTAGGTGGAAT
+    """
+)
+
+records = parse_fasta(fake_file)
+```
+
+So we start with an empty `Vector` called `records`,
+and empty `String`s representing the record name and sequence.
+we need to assign these variables outside the "scope" of the loop,
+otherwise they won't persist outside of it.
+We'll talk more about "scope" another time.
+
+Then, we go through each line of the file,
+checking if it starts with `">"`.
+If it does, we push the current record into the `records` vector,
+and reset the `record_name` and `sequence` variables.
+If it doesn't, we append the line to the `sequence` variable
+(in julia, we combine strings with the `*` operator).
+
+The `isempty(record_name)` check in the 7th line of the function is used
+for the first record, to avoid pushing the initial empty strings,
+and the final `push!(records, (record_name, sequence))`
+is to deal with the final sequence,
+since the loop will terminate without encountering a final `>`.
+
+!!! tip "Our 'fake file'"
+    To show off how this works, we're using an `IOBuffer`,
+    which in most cases works pretty similar to an open file buffer.
+    One thing to keep in mind is that, unlike a `String` or other data structure,
+    the reading through of an `IOBuffer` moves the "read head" -
+    in other words, there's a pointer to the end of the file after we've read through it.
+
+    So if we run our function again:
+
+    ```@example
+    records = parse_fasta(fake_file)
+    ```
+
+    We can see that the `records` vector only contains a Tuple with empty strings,
+    since the loop never runs - `eachline(file)` here doesn't have any entries.
+
+    If you need to use it again, you can reset the `IOBuffer` using the `seekstart()` function:
+
+    ```@example
+    seekstart(fake_file)
+    records = parse_fasta(fake_file)
+    ```
+
+At this stage, we could take our `records` Vector,
+find the maximum gc entry, and then get that record.
+As with many problems, there are a lot of ways to do this,
+but we'll try the `findmax` function.
+This function takes a function as the first argument,
+and returns the "index" of the maximum value after using that function on the entry.
+
+The "index" is an integer that we can use to access a particular entry
+in a vector or other container.
+In julia, the syntax for indexing is `container[index]`.
+And notice that, since each of our `records` are themselves containers (`Tuple`s),
+we can access the first element of each record (the identifier) using `record[1]`,
+and the second element of each record (the sequence)using `record[2]`.
+
+!!! tip "One-based indexing"
+    Julia uses one-based indexing, meaning that the first element of a container is at index 1,
+    not 0 as in some other languages like Python.
+
+```@example gc
+i = findmax(record -> gc_content(record[2]), records)
+```
+
+```@example gc
+top_gc = records[i][1] # the first index gets the record, and the `[1]` gets the identifier
+```
+
+!!! tip "do-block syntax"
+    Many functions in Julia take a function as the first argument,
+    and sometimes the `->` anonymous function syntax is a bit annoying to use.
+    Instead, one can use the `do` block syntax.
+    This is a way to write a more complicated anonymous function,
+    eg one with multiple lines.
+
+    The structure of it is:
+
+    ```julia
+    some_func(iterator) do args # args is/are the argument(s) to the function
+        # function body
+    end
+    ```
+
+    In other words, our `findmax` function above could have been written as:
+
+    ```julia
+    i = findmax(records) do record
+        gc_content(record[2])
+    end
+    ```
+
+### Don't store the whole file in memory
+
+If you end up with a really large file,
+storing every record in memory,
+and then iterating over it a second time to calculate the GC content
+may not be the best approach.
+Instead, you can use a streaming approach,
+where you read the file line by line,
+and calculate the GC content on the fly.
+This way, you only need to keep one record in memory at a time,
+and you can find the record with the highest GC content without having to store all records in memory.
+
+```@example gc
+function streaming_gc(file)
+    max_gc = 0.0
+    max_id = ""
+
+    current_id = ""
+    current_seq = ""
+    for line in eachline(file)
+        if startswith(line, ">")
+            if length(current_seq) > 0 && gc_content(current_seq) > max_gc
+                max_gc = gc_content(current_seq)
+                max_id = current_id
+            end
+            current_id = lstrip(line, '>')
+            current_seq = ""
+        else
+            current_seq *= line
+        end
+    end
+    if length(current_seq) > 0 && gc_content(current_seq) > max_gc
+        max_gc = gc_content(current_seq)
+        max_id = current_id
+    end
+    return max_id
+end
+
+seekstart(fake_file)
+streaming_gc(fake_file)
+```
+
+### BioSequences method
+
+As you might imagine,
+parsing FASTA files is something we do all the time,
+so there's a package for that.
+In julia, that package is called [`FASTX.jl`](https://github.com/BioJulia/FASTX.jl)
+(there's also a "FASTQ" file format).
+
+This package provides FASTA "readers" and "writers",
+and return iterators that validate recores and return
+data structures containing the sequeince identifiers and `BioSeqeunce`s.
+
+```@example gc
+using FASTX
+
+function fastx_gc(buffer)
+    max_gc = 0.0
+    max_id = ""
+
+    FASTAReader(buffer) do reader
+        for record in reader
+            gc = gc_content(sequence(record))
+            if gc > max_gc
+                max_gc = gc
+                max_id = identifier(record)
+            end
+        end
+    end
+    return max_id
+end

From a0129fbe466830c4fd0774f287a9913756f0f6a1 Mon Sep 17 00:00:00 2001
From: Kevin Bonham <kevin@bonham.ch>
Date: Mon, 10 Mar 2025 16:23:27 -0400
Subject: [PATCH 5/6] add input file

---
 .../rosalind/problem_inputs/rosalind_gc.txt   | 116 ++++++++++++++++++
 1 file changed, 116 insertions(+)
 create mode 100644 docs/src/rosalind/problem_inputs/rosalind_gc.txt

diff --git a/docs/src/rosalind/problem_inputs/rosalind_gc.txt b/docs/src/rosalind/problem_inputs/rosalind_gc.txt
new file mode 100644
index 0000000..d0858f5
--- /dev/null
+++ b/docs/src/rosalind/problem_inputs/rosalind_gc.txt
@@ -0,0 +1,116 @@
+>Rosalind_1584
+TCTGTGTGGAGGGGTAATAACGCACTGCTCATGCTCGGGAAGCGCTAATGACGTTTGTAG
+CAAAACGTCAAACGAGTCACCTCTGATAGCTAAACGCCCTGTATCAGGGCCGATGAACAC
+TCACGCCTCCAAAACCAGTTAGAGCAATTTCCAGGCGCGCAAGAACATCGGCTCTCGATG
+CATAGAGACCGACCCGAGACAAGTCCGCGGGCTCAATGCCCGATCCTTTTACGTAGCACG
+ATTACCCATTTCGCCTAACGATGGTTCGCAGGTTCAGAGTTAACGCCGTAGTACATGAAT
+CCCTAGTAATATTGCCATTGTCCCTTATGCCCCGAGGTGCAACTACGACTGTCTTACAAT
+TAAAACTAGACGCAGCATTGTCCTAAGACCCACGTATCAGCAGGTCACCTATCCGACCTG
+CTTGACCCTGTCCTTCTCGGTTGTCGGAGCGACTCGCCGGAGTACGAATAGCCATAAAGT
+AATAAAGGGAACGGTGTAGATCACGAGCTAAATTCGCTTAATCGCATGCTGAAGGGAGGG
+GATCCGCACCATCATTATATGACAGAGCACGGGTGCCCCTTGCGCGATTTGTCACTCGTC
+TTACATTGCCTTCCAACGCAGCCAGGGAGGACAGAAACGTCGTACGACTAAGCAGCGTGA
+CGGATGTGACACACGACACCAAAATAAACTATGGCGCCTAACAGTTACTCAAATGAGAAT
+CTTAGATTTATAAAGTGTTGTCACCACAGGGGGAAGTGCCGGTTAGCCTGATCAAAGCCG
+TAATTAAACCCAGCCGTATTTGTTTGGCTTATGCCGGGACGCGGCGCTTCTTCTTGGTTA
+CCGCTCTTGGCCCCGGGTTCCGCCTGGCACCCCATCTA
+>Rosalind_0632
+ACAAACGTTGCAATGCAGCAGCTCGCATACCAAAACCCCTGTGGCCTACCGGTCATTAAC
+TCGGTTTCCATCAAGTCGGCGGGCATACTGAAAGTGCAGACGCCCCGGGACAATGATTCA
+CCATACTAACAGAGAAAGGGCTCTCGGTAAGCCGGGTCCCTGCGAACCCTGTAGACTTCC
+ATTTCTAAGTGGATAACCCATCGAATTGTTCGACGGCTTTTAGAATGAGTTTTTCGAGCC
+ACACTGCTGTCGCGGTTCGACATAAGGCCCGTCAGGCCCCTATTACTATTATTTAGCTGT
+ATTTACAACCCCAACTCGAGTTTTATCGTTCGTGGAAACCGGTTTAGCACCGGCTCTGAG
+AAGCAGTAGCGCCGTATTTCGCGCTCAATACTTAAACAAATATGATAGCTGATTACCCTA
+AGGCCTAGGCGTTAGCGATGTGATTACATGTATCGGAGGCCTTTGTGTCTCGTTACAGAA
+AGAAGGACTCGGTAAAGCTGAAGTGATTTTGATAACAATCATATTCAATTAGGGTTCTAT
+GGTTTCAGAGGAGACGGAGGGCTTGGCAGACTCTTTTAAGTCGTCTGCGGTAATACAATT
+ATTAAAACCAAGGAACAACAGCCCTGGAGGCTGAATGCTGGAATAGTTTAACACCATTAT
+ACTTTGGGCAGTTCCAGCGCGGTACATTAATTCAAGAGATGTGCCATATCGACAAATGTC
+CAAGAATCTATTAAGAAATAAGCCATTTTGTAGGCTAACGATGAGCTCCCCAGCCGCATC
+CTTCATGAGTTTAAACTTTCCGTCGCGATGTACTGCCACGGGGTACGCCAAGCTGTATCA
+CCCGTTGGGGTAATCCCGCGGCCCCGGAGTGTTTAACGTCCAAATAGGGGTACAAGCAAG
+TGGGCGTGTGGTTCGGGCGTGACAAGCCCCCGTGGATAAATGTATAAAGGGTATTGGTTT
+CTTCAGTACAC
+>Rosalind_9974
+ATTGTGCGCTACATCCCAGCGAGCCACGGATGGACAAGAACATACCTTTCCGGAGCACGG
+CCGGGATTAAGGTTTCGTGAGACCCTGGGATGTGCTGTTGTCAATCAAGTTTGGAATGTC
+AACGCAGCCACCACGCTGCTTATAATAGGCATGGCTCCTCGTTCGTGATCAGGGGGGTTG
+AGAGTCTGCACTTAGCTATATTGCATGCGAGCTCTACTCATAGCACCATACGTAGAAGGT
+CCTTCCCGTAATACTTACAAACCCATCCGTCATAAGAGTTGTGTTCACGTTGCACATCAT
+ACCACAACTGCCGCAGTCGCCATCGGTTCAGCATTAGTTGCGTAGGACATAAAAGAGGCT
+TTAATCCCCCCTCGAACCTGCCACTTGCTGTGTGAGTCAAGGCTTCTCCGCCACGCGCGG
+TCAAATTCGTGAGACAATCGTAAGGCTCTGCGTTTGCTGCGAACGGGCGCTAAATAATTG
+CTACTGTGGTAGGATGACCGAGGGAGGAACAGCGATCCTACTGCACTCCGATAATGCATC
+GTGTTGAAAGCAGAATCGTCTTTAAAGCAAAGTGGTCAGGAGGGGATGACCCTTTCCCTT
+ACCGCATAGCGCCCCCACTTTCTGCGAGTCCCTAGCTAACTTAACCGCATCTGAAACTAA
+AACCTGCTTGCGCGGCTCATCATGCCGGGGTTTAATTGCTACGCGGGCGTTCCAGAGACT
+GTAGTGGTCTTGCGTCACAAGCACCCTCTATGAATTACAATTGAGGCGCTGATCTAGGAC
+CCGCCTTGCGTTCTACCAAACTTCCTTTAGTATGCCCCGGATGCATTAGATCATGCTAAG
+ACAAAAGTTCTGTTGTTGCACGTAGAAAGGCGTTTATGCGTACGTATGTAGCGT
+>Rosalind_0894
+GCACAGATAGCGCTGCATCGCAAGTGTCCAAAGCGAAGGTCTAGGTAAGAAGGGATAACT
+GGATGTTGATACGCCAGAATAGTGCCGCTGCACTTAGTGCCGGCATCTGTTGTCAAGTGC
+AAGCTGTAGGGCTTCTCGCTAAACGAGACGTTCCCCGCGAAGCTGGGTCCACCAGAAACC
+CTGTGTCAGCTGCGCGTCAAGTATCGCAACGCATTTTAGTTGAAGTGTCGGTGTGCTGAG
+CTTCTCCAGCTGAAATAACATATTGAACTTGACAGCCCCGCACGGTGAAACTAACGTTGA
+CCCGCCACCTCCGAGATGAAGCGTGCTCGCGCCGACTGGAGCTGATCGGTCAATTATTCA
+GTATAGTACTGATGGTCAACGATGACCCCAAATGAATCTTCGTGTGAATTAGTGTCAACG
+GTGCCCGGTGGGATGGATGTAGCTAAGGGGGCGCGTTGGTGGGAAAAGGGAAGCTGATCC
+CTAACAGGCGATTTCTGTAGAAGTCCGTACTAACGTGTCGGTTACCGCGCTCGGGTGAAC
+AGGTCCCGACCTGCGCGTACGGTAGGAGATTGCAGCTTTAGGTTACTTCCACGCCGATTG
+TTTAACTAACTACTACAAGAGCCTAAGACATGATGGGCGAGAAATGGCGTGTCCCTTATG
+GGTGTAGTGTCTTCCCCGTGATACGACTGTGACCGCTGCCCTCCCGCGAGCCTAGCCCTT
+CTCTGATGTCAACGAACGATCAACACTTAGAATAGGGCTCCCAGACTATTTGTCGCCGTG
+TGGCATACCACGAAATCTTACAATTACGGCCATGTTGAGGTACTAGAGCTTCATGGCAAT
+TTCATACTATCTGTCATAATAACTTCTGAACTCCAAGTCGTGCTTTGAGAAAAAACCTGC
+CAGGTTG
+>Rosalind_6334
+CCCAATTTATTCCGTCGAATTGGAATGATACCCGATGTCCACGTCCTTGTCGCTTGCAAT
+CATAATTACTCGTTGTCAATCCGTAAGCTCAGGCAGCCCTGCGGGGCCACCCGACCGGCT
+GAACCGACCCCATGCGTATTTGATTCTTCTTCGAAGCTCCACCTAACGCGACTGTCTGCT
+GAGACAAAGCGACGTATTATATATTAGGACCGGGTCTTGCATCTAGTCTTTGATTGAAAC
+CAACTGCGGTTCTGCATTTGAATTCAGCGGCGGTCCTATTCGGGCAATAGCTCCCCTTAA
+TAGGTACTAAGGCCCCTGCCTTGTATAGTACCTTGGGCTCTATATACTTGGAAGTATCCT
+TTCATAGCCGCGCCTGCCGGTTAGAGACCGTGCTGTCTCATTGGTCTGTGAGAAGTCGTG
+CTCGTTGACCGTAGGCATTTCTCTTTGTGGCCCTAACACATCGTTGCGTTTCGTCGTGTT
+ATTGACAAATACAGTGCGCCGGTGTTGCGAGAGGCGGACAACCTGACCGTACACCCTAAT
+GGTGTCGGTTAGGGATCATAGTCAACATCAAAGTCGGTTCCCCTCAAAGTTGGGAATGGG
+GTATCACCACGGATGTAACTTACAAACATATGGACTATCTATACTTGGAGAGCATGTTAC
+GGGCGGGACTTGCATTAGTTCTTACTCGTCCATATAAAACACCATTATACTTATCCGACG
+GCTGTTCCGTACATCATTCCTACCTAAATACACCCTTTGACTACAGACCTCACCAAGAGC
+TAGGAACAACCTGAGCATTCACCTCGCCGACAGTAGTCGTCTGTAATTTCCTGGACGTCA
+TGCCAAATTAACCGTGTAGTTGGGATCCGGAGATTGACGGACTGGCTGGTTG
+>Rosalind_2287
+GCCCTAGTTGGGACTGTAACGGAAAGCGTGACAATGGTAAGAAAGATTCCTAACGACCCC
+CGTCAGCGGTACTCAACGCAAGTCTACAGAACCACTCGCCGCACTTCCGGTCATATGTTG
+TCCTCCAGTGTAGCCACTATTACAATCTGACGCTAGTGACACGCATATCAGAAATCTCTT
+ACAGCTAGCTCCAAAAGTAAGGCAGTATCACCTTATTCCGTAGTTTCACCCGCGGTTTCC
+GCATTGACGTATTGGTTCTCGACTGTAGAGGGGCATCCTTCCATAGTATTGCTTAGAGGC
+ATAACGTCGCTGCCTCAATAAGCTGAATGCTAGCTGGTCCGTTGTAATTACACTAGGCCA
+ATTGCTTAAAGCTGCAATACTTGGTGTCCAAGGACAATAACCTTCAGAACGGGGCACGGT
+ATGTTCAATGCTAAGAGTGCCCTCTACTGGTACTTCCGGCGCTTCGGCTGCGGAAGAAAC
+TCTAAGCAATCTGAACCACGTCACCCTTTTCTACATGAATGTATTTGCGACACCGTCAAC
+GACCGGGCGTCCTCTTCTTCTGCATGTCGTAGACCTTAGAAATCAAATAAGAGTGAGTGT
+AGCAGCGTGAAGGCGAATAATGCCGCCGGTTTAGCCGAACGGGCTGAGTTGTTTATTTGG
+GTCTTGTAGGGTATAAATGCCCAGTTGATTTTCAGTCGTAGTTAAGTCATGCATCAACGG
+CCAAGGTGAGTTGACCTGTGTAGGATATTGTGGGAACACCCAATATCATACTCGACATCT
+TTTCGCCGGCCACACTGATTATGGTCAAATGTGGTTTCGATGTGGTATGTTCCGAGAACT
+TACCGCAC
+>Rosalind_5466
+CGCACGCCGCCCGAGTTAGGAGCTAGTCGTATTACCGCTTTAATAAAGCTGCAGAAATGT
+CTGGAGTCCACTCCCACATGGTGTACTTACCGCGCCGCAGTTACAGTCCTGATTTCCCAG
+ACAACGATAAGGATGATTCACTCAGATCAGCCCTATCGTTCATACCTACGTACCGACGCT
+ATGCCTTTTCTTAGATGTAGTGTCAAGTAGAGGACCAACTTAGTGGACAGCAGTATGTCC
+CAGGTACCCCTCACAGTATAGGCCCGCTCGGAAGCGCCCCTATCAGGCTCTAAAGCAGCG
+ATAATCGATCCGGGCAACTTGCCACATCACCAGGGAGTGAAGTCAGGCTCCTGGTTTCCG
+CCTTAGCAGTTTGCATGCTGCTTACACGCGAGTCTCTAAAGGACACGTTCACGAGTCTGC
+CCTCCTATCAGAGCCGTAAAGCGAGTTGGTAGTCGGATGATGGGTCGTTACCGTGCGACA
+GTAGTTTACGTCTCGAGGAAAAAGTATCATATCCATGAACTAGGTGACCCCTATCATCTG
+TTAGCCAACCGAGCATTCAGCACCCTTGAGTGATCACTATACCGAACCGACCATCTGGGA
+CTAAACGGTACGCATTCATCAGCTTCCGCCTTTTCTCGGCTCATCGAACGGTAGTCGATA
+CAGGGGATTCCACACGGGACTAAGTGATCGTGTGACCATGTCCGTGCGTCTAATTCTACG
+GTAGCGAAGCGGGGAGGTTCCTCTTCGAAACCGCAGTCGAAGATGCCATCGGCCCTTGTC
+CCACATGTCGAGTCGACTCGGACGACCAAATCTCAGCCGCGACGGTCAACTCTAAGGGTT
+CTGGACTACAGTCGACTTATCACTGACGCCTCTAGGATGCGTGTTGCTAGTATTTTACAC
+AGTG

From 6ed541a7c61e337ec8d526619e1e4350206d23d7 Mon Sep 17 00:00:00 2001
From: Kevin Bonham <kevin@bonham.ch>
Date: Mon, 10 Mar 2025 16:26:11 -0400
Subject: [PATCH 6/6] remember to save

---
 docs/src/rosalind/05-gc.md | 80 ++++++++++++++++++++++++++++----------
 1 file changed, 59 insertions(+), 21 deletions(-)

diff --git a/docs/src/rosalind/05-gc.md b/docs/src/rosalind/05-gc.md
index ee9d317..1418349 100644
--- a/docs/src/rosalind/05-gc.md
+++ b/docs/src/rosalind/05-gc.md
@@ -66,12 +66,12 @@ returns `true`.
 ```@example gc
 my_seq = "AACCGGTTCT"
 
-function gc_content(seq)
+function myGC(seq)
     gcs = count(base-> base in ('G', 'C'), seq)
     return gcs / length(seq)
 end
 
-gc_content(my_seq)
+myGC(my_seq)
 ```
 
 A few things to note about this -
@@ -80,7 +80,7 @@ so this function will happily count the capital G's and C's
 of any string:
 
 ```@example gc
-gc_content("Goodbye, Cruel World!")
+myGC("Goodbye, Cruel World!")
 ```
 
 But for now, this will do.
@@ -97,7 +97,7 @@ in our type signature up above:
 using BioSequences
 
 bioseq = LongDNA{2}(my_seq)
-gc_content(bioseq)
+myGC(bioseq)
 ```
 
 What's going on here?
@@ -110,11 +110,17 @@ or we can use the built-in predicate function `isGC()` from BioSequences,
 which returns `true` if it encounters any G or C nucleotide.
 
 ```@example gc
-function gc_content(seq::LongNuc) # this type matches LongDNA or LongRNA
+function myGC(seq::LongNuc) # this type matches LongDNA or LongRNA
     gcs = count(isGC, bioseq)
     return gcs / length(seq)
 end
 
+myGC(bioseq)
+```
+
+Or, even more simply, there's already a `gc_content()` function in BioSequences.jl:
+
+```@example gc
 gc_content(bioseq)
 ```
 
@@ -144,11 +150,11 @@ putting each sequence record into a `Vector`.
 
 
 ```@example gc
-function parse_fasta(file)
+function parse_fasta(buffer)
     records = [] # this is a Vector of type `Any`
     record_name = ""
     sequence = ""
-    for line in eachline(file)
+    for line in eachline(buffer)
         if startswith(line, ">")
             !isempty(record_name) && push!(records, (record_name, sequence))
             record_name = lstrip(line, '>')
@@ -205,7 +211,7 @@ since the loop will terminate without encountering a final `>`.
 
     So if we run our function again:
 
-    ```@example
+    ```@example gc
     records = parse_fasta(fake_file)
     ```
 
@@ -214,7 +220,7 @@ since the loop will terminate without encountering a final `>`.
 
     If you need to use it again, you can reset the `IOBuffer` using the `seekstart()` function:
 
-    ```@example
+    ```@example gc
     seekstart(fake_file)
     records = parse_fasta(fake_file)
     ```
@@ -238,7 +244,7 @@ and the second element of each record (the sequence)using `record[2]`.
     not 0 as in some other languages like Python.
 
 ```@example gc
-i = findmax(record -> gc_content(record[2]), records)
+i = findmax(record -> myGC(record[2]), records)[2]
 ```
 
 ```@example gc
@@ -281,17 +287,20 @@ This way, you only need to keep one record in memory at a time,
 and you can find the record with the highest GC content without having to store all records in memory.
 
 ```@example gc
-function streaming_gc(file)
+function streaming_gc(buffer)
     max_gc = 0.0
     max_id = ""
 
     current_id = ""
     current_seq = ""
-    for line in eachline(file)
+    for line in eachline(buffer)
         if startswith(line, ">")
-            if length(current_seq) > 0 && gc_content(current_seq) > max_gc
-                max_gc = gc_content(current_seq)
-                max_id = current_id
+            if length(current_seq) > 0
+                current_gc = myGC(current_seq)
+                if current_gc > max_gc
+                    max_gc = current_gc
+                    max_id = current_id
+                end
             end
             current_id = lstrip(line, '>')
             current_seq = ""
@@ -299,9 +308,12 @@ function streaming_gc(file)
             current_seq *= line
         end
     end
-    if length(current_seq) > 0 && gc_content(current_seq) > max_gc
-        max_gc = gc_content(current_seq)
-        max_id = current_id
+    if length(current_seq) > 0
+        current_gc = myGC(current_seq)
+        if current_gc > max_gc
+            max_gc = current_gc
+            max_id = current_id
+        end
     end
     return max_id
 end
@@ -329,14 +341,40 @@ function fastx_gc(buffer)
     max_gc = 0.0
     max_id = ""
 
-    FASTAReader(buffer) do reader
+    FASTAReader(buffer) do reader # see the "tip" above about do-blocks
         for record in reader
-            gc = gc_content(sequence(record))
+            gc = gc_content(sequence(LongDNA{2}, record))
             if gc > max_gc
                 max_gc = gc
                 max_id = identifier(record)
             end
         end
     end
-    return max_id
+    return max_id, max_gc
+end
+
+seekstart(fake_file)
+fastx_gc(fake_file)
+```
+
+## Reading the file
+
+So far, we've been dealing with an IOBuffer,
+which is what we get when we open a file.
+But to actually solve the rosalind problem,
+we need to open a file on the disk.
+
+In julia, we do this with the `open` function,
+which takes a file path, and optionally a function to call with the file handle.
+If we don't provide a function, we get an IOBuffer just as we've been using,
+but we need to be sure to close it when we're done.
+
+```@example gc
+function file_gc(filename)
+    max_id, max_gc = open(fastx_gc, filename)
+    println(max_id)
+    println(100 * max_gc)
 end
+
+file_gc("problem_inputs/rosalind_gc.txt")
+```