From b632667ce57af89691407bb8668e1512775278ae Mon Sep 17 00:00:00 2001 From: Calvin Date: Fri, 15 Mar 2013 15:26:20 -0400 Subject: nbc added --- src/nbc/GPL-3 | 676 ++++++++++++++++++++++++++++++++++++++ src/nbc/LICENSE | 45 +++ src/nbc/Makefile | 16 + src/nbc/NEWS | 3 + src/nbc/README | 114 +++++++ src/nbc/binary.sml | 26 ++ src/nbc/build.sh | 4 + src/nbc/clean.sh | 2 + src/nbc/count | Bin 0 -> 572503 bytes src/nbc/count.ml | 60 ++++ src/nbc/count.mlb | 10 + src/nbc/count.sml | 290 ++++++++++++++++ src/nbc/fail.sml | 10 + src/nbc/fasta-all.mlb | 8 + src/nbc/fasta-test.mlb | 9 + src/nbc/fasta.mlb | 8 + src/nbc/fasta.sml | 326 ++++++++++++++++++ src/nbc/gene.mlb | 5 + src/nbc/gene.sml | 49 +++ src/nbc/genome.sml | 33 ++ src/nbc/gzip.c | 13 + src/nbc/gzip.sml | 164 +++++++++ src/nbc/history.mlb | 5 + src/nbc/history.sml | 74 +++++ src/nbc/judy.sml | 175 ++++++++++ src/nbc/kahan.sml | 31 ++ src/nbc/main.sml | 98 ++++++ src/nbc/matlab.sml | 135 ++++++++ src/nbc/misc.sml | 125 +++++++ src/nbc/nmer-all.mlb | 6 + src/nbc/nmer-test.mlb | 9 + src/nbc/nmer.mlb | 7 + src/nbc/nmer.sml | 557 +++++++++++++++++++++++++++++++ src/nbc/options.sml | 202 ++++++++++++ src/nbc/parse-state.mlb | 5 + src/nbc/parse-state.sml | 89 +++++ src/nbc/probabilities-by-read | Bin 0 -> 394061 bytes src/nbc/probabilities-by-read.mlb | 8 + src/nbc/probabilities-by-read.sml | 210 ++++++++++++ src/nbc/program.sml | 9 + src/nbc/promise.mlb | 5 + src/nbc/promise.sml | 24 ++ src/nbc/score.mlb | 27 ++ src/nbc/score.sml | 30 ++ src/nbc/sequence.sml | 74 +++++ src/nbc/stopwatch.sml | 24 ++ src/nbc/storejudy.sml | 17 + src/nbc/stream.mlb | 6 + src/nbc/stream.sml | 243 ++++++++++++++ src/nbc/substitution.grm | 50 +++ src/nbc/substitution.lex | 54 +++ src/nbc/substitution.sml | 26 ++ src/nbc/tabulate.ml | 190 +++++++++++ src/nbc/test-generate.sml | 1 + src/nbc/test-library.mlb | 5 + src/nbc/test-library.sml | 50 +++ src/nbc/tree.mlb | 6 + src/nbc/tree.sml | 225 +++++++++++++ 58 files changed, 4673 insertions(+) create mode 100644 src/nbc/GPL-3 create mode 100644 src/nbc/LICENSE create mode 100644 src/nbc/Makefile create mode 100644 src/nbc/NEWS create mode 100644 src/nbc/README create mode 100644 src/nbc/binary.sml create mode 100644 src/nbc/build.sh create mode 100644 src/nbc/clean.sh create mode 100755 src/nbc/count create mode 100644 src/nbc/count.ml create mode 100644 src/nbc/count.mlb create mode 100644 src/nbc/count.sml create mode 100644 src/nbc/fail.sml create mode 100644 src/nbc/fasta-all.mlb create mode 100644 src/nbc/fasta-test.mlb create mode 100644 src/nbc/fasta.mlb create mode 100644 src/nbc/fasta.sml create mode 100644 src/nbc/gene.mlb create mode 100644 src/nbc/gene.sml create mode 100644 src/nbc/genome.sml create mode 100644 src/nbc/gzip.c create mode 100644 src/nbc/gzip.sml create mode 100644 src/nbc/history.mlb create mode 100644 src/nbc/history.sml create mode 100644 src/nbc/judy.sml create mode 100644 src/nbc/kahan.sml create mode 100644 src/nbc/main.sml create mode 100644 src/nbc/matlab.sml create mode 100644 src/nbc/misc.sml create mode 100644 src/nbc/nmer-all.mlb create mode 100644 src/nbc/nmer-test.mlb create mode 100644 src/nbc/nmer.mlb create mode 100644 src/nbc/nmer.sml create mode 100644 src/nbc/options.sml create mode 100644 src/nbc/parse-state.mlb create mode 100644 src/nbc/parse-state.sml create mode 100755 src/nbc/probabilities-by-read create mode 100644 src/nbc/probabilities-by-read.mlb create mode 100644 src/nbc/probabilities-by-read.sml create mode 100644 src/nbc/program.sml create mode 100644 src/nbc/promise.mlb create mode 100644 src/nbc/promise.sml create mode 100644 src/nbc/score.mlb create mode 100644 src/nbc/score.sml create mode 100644 src/nbc/sequence.sml create mode 100644 src/nbc/stopwatch.sml create mode 100644 src/nbc/storejudy.sml create mode 100644 src/nbc/stream.mlb create mode 100644 src/nbc/stream.sml create mode 100644 src/nbc/substitution.grm create mode 100644 src/nbc/substitution.lex create mode 100644 src/nbc/substitution.sml create mode 100644 src/nbc/tabulate.ml create mode 100644 src/nbc/test-generate.sml create mode 100644 src/nbc/test-library.mlb create mode 100644 src/nbc/test-library.sml create mode 100644 src/nbc/tree.mlb create mode 100644 src/nbc/tree.sml (limited to 'src/nbc') diff --git a/src/nbc/GPL-3 b/src/nbc/GPL-3 new file mode 100644 index 0000000..4432540 --- /dev/null +++ b/src/nbc/GPL-3 @@ -0,0 +1,676 @@ + + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. + + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. + + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). + + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". + + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. + + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. + + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + Copyright (C) + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +. + diff --git a/src/nbc/LICENSE b/src/nbc/LICENSE new file mode 100644 index 0000000..827c46b --- /dev/null +++ b/src/nbc/LICENSE @@ -0,0 +1,45 @@ +BSD: + +Copyright 2008, 2009, 2010 Drexel University +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: +1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +3. Neither the name of the University nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY AND CONTRIBUTORS ``AS IS'' AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +SUCH DAMAGE. + +GPL: + +Copyright 2008, 2009, 2010 Drexel University + +This program is free software: you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation, either version 3 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program. If not, see http://www.gnu.org/licenses/. diff --git a/src/nbc/Makefile b/src/nbc/Makefile new file mode 100644 index 0000000..4f9c324 --- /dev/null +++ b/src/nbc/Makefile @@ -0,0 +1,16 @@ +MLTON = mlton +MLYACC = mlyacc +MLLEX = mllex +%.grm.sig %.grm.sml: %.grm + $(MLYACC) $^ +%.lex.sml: %.lex + $(MLLEX) $^ +%: %.mlb + $(MLTON) $(MLTONFLAGS) -output $@ $^ +all: count probabilities-by-read +count: count.mlb +probabilities-by-read: probabilities-by-read.mlb +score: score.mlb +tabulate: tabulate.mlb +clean: + rm -f count probabilities-by-read diff --git a/src/nbc/NEWS b/src/nbc/NEWS new file mode 100644 index 0000000..501c1bb --- /dev/null +++ b/src/nbc/NEWS @@ -0,0 +1,3 @@ +score and tabulate have been rewritten in Standard ML, to take advantage +of certain optimizations that the MLton compiler performs, and improve +performance. Their behavior should be unchanged, except for some speedup. diff --git a/src/nbc/README b/src/nbc/README new file mode 100644 index 0000000..d2a3688 --- /dev/null +++ b/src/nbc/README @@ -0,0 +1,114 @@ +This is the Naive Bayes Classifier, developed by the genomic signal +processing lab led by Professor Gail Rosen at Drexel University. + +It uses a method similar to that used in many email spam filters to +score a genetic sample against different genomes, to possibly identify +the closest match. The method is described in the paper <> + +REQUIREMENTS + +To compile this code you need the following (versions given are the +versions we used, but slightly older or newer versions probably work too): + +MLton 20100608 +GNU binutils 2.18.1 +GNU C Compiler 4.3.2 +GNU Make 3.81 +Judy 1.0.5 +zlib 1.2.3.3 + +It has been tested extensively on Mac OS X and Linux, on both 32-bit +and 64-bit processors. It probably works on other, similar operating +systems without any changes. 64-bit uses more memory but also allows +larger genomes to be used. No other differences between the 32-bit and +64-bit versions have been observed. It may work on Windows, but that has +not been attempted. + +Since it is now written in Standard ML, it may in theory be compilable +with other Standard ML compilers, such as Standard ML of New Jersey, +MLKit, PolyML, etc. We have not attempted this. Some changes would +probably be necessary since the MLton foreign function interface (used +for judy array and gzip support) is different from the interface used +by other compilers. + +BUILDING + +For all the example commands, the $ indicates the shell prompt. Don't type +the $, just everything after the $. And most of these examples should +not be typed in verbatim (unless you happen to have the genomes for a +unicorn and a wumpus lying around - in that case, lucky you!). Instead +modify the examples to suit your particular circumstances. + +Run "make" to build: + +$ make + +Assuming it completes without any problems, you will have three +programs: count, score, and tabulate. Install them somewhere in your path: + +$ cp count score tabulate /usr/local/bin + +SETUP + +The first step is to set up your genome data. Create a new directory, +for example "genomes", and inside that directory, create a directory +for each genome: + +$ mkdir genomes +$ mkdir genomes/Unicorn +$ mkdir genomes/Wumpus + +Then you run count on the FASTA files containing the genome (and any +plasmids), for each word size you want to score against: + +$ count -w genomes/Unicorn/15perword.gz -t genomes/Unicorn/15total \ + -r 15 Unicorn.fasta Unicorn_plasmid.fasta +$ count -w genomes/Unicorn/13perword.gz -t genomes/Unicorn/13total \ + -r 13 Unicorn.fasta Unicorn_plasmid.fasta +$ count -w genomes/Wumpus/15perword.gz -t genomes/Wumpus/15total \ + -r 15 Wumpus.fasta +$ count -w genomes/Wumpus/13perword.gz -t genomes/Wumpus/13total \ + -r 13 Wumpus.fasta + +SCORING + +Now, run score on your input file. Order 15 usually gives the best +results so we'll try that first: + +$ score -a semen_sample.fasta -r 15 -j genomes + +For this example, you would get two files: + semen_sample-15-Unicorn.txt + semen_sample-15-Wumpus.txt + +TABULATION + +For easy import into a spreadsheet, you can run tabulate to put it in +CSV format: + +$ tabulate semen_sample-15-Unicorn.txt semen_sample-15-Wumpus.txt + +This will create the files: + semen_sample-15-0.csv.gz + semen_sample-15-1.csv.gz + semen_sample-15-2.csv.gz +and so on. The exact number of files will depend on how big your input +file is. + +FURTHER INFORMATION + +Each command has a --help option, which may be helpful. + +BUGS + +count and score load the entire genome into memory. For large genomes this +requires a stupendous amount of memory. + +LICENSE + +It has been licensed under the <> license. +See the LICENSE file for details. + +FEEDBACK + +Any feedback should be directed to gailr@gmail.com. diff --git a/src/nbc/binary.sml b/src/nbc/binary.sml new file mode 100644 index 0000000..af5acd5 --- /dev/null +++ b/src/nbc/binary.sml @@ -0,0 +1,26 @@ +signature BINARY = sig + val fromInt32: int -> Word8Vector.vector + val fromInt16: int -> Word8Vector.vector + val fromReal: real -> Word8Vector.vector +end + +structure Binary :> BINARY = struct + val word8VectorFromArray = Word8ArraySlice.vector o Word8ArraySlice.full + fun fromInt32 i = + let + val array = Word8Array.array (PackWord32Little.bytesPerElem, 0w0) + val word = LargeWord.fromInt i + in + PackWord32Little.update (array, 0, word) + ; word8VectorFromArray array + end + fun fromInt16 i = + let + val array = Word8Array.array (PackWord16Little.bytesPerElem, 0w0) + val word = LargeWord.fromInt i + in + PackWord16Little.update (array, 0, word) + ; word8VectorFromArray array + end + val fromReal = PackRealLittle.toBytes +end diff --git a/src/nbc/build.sh b/src/nbc/build.sh new file mode 100644 index 0000000..3328271 --- /dev/null +++ b/src/nbc/build.sh @@ -0,0 +1,4 @@ +#! /bin/sh -v +mllex substitution.lex +mlyacc substitution.grm +mlton -link-opt -lJudy -link-opt -lz score.mlb gzip.c diff --git a/src/nbc/clean.sh b/src/nbc/clean.sh new file mode 100644 index 0000000..58d95bf --- /dev/null +++ b/src/nbc/clean.sh @@ -0,0 +1,2 @@ +#! /bin/sh -v +rm substitution.lex.sml substitution.grm.sig substitution.grm.sml score diff --git a/src/nbc/count b/src/nbc/count new file mode 100755 index 0000000..d1c5d04 Binary files /dev/null and b/src/nbc/count differ diff --git a/src/nbc/count.ml b/src/nbc/count.ml new file mode 100644 index 0000000..bd97b7b --- /dev/null +++ b/src/nbc/count.ml @@ -0,0 +1,60 @@ +module ExtArray = ExtArray.Array +let (|>) a b = b a + +module Options = struct + let parser = OptParse.OptParser.make ~version:"1.0" () + let per_word = + let option = OptParse.StdOpt.str_option ~metavar:"file" () in + OptParse.OptParser.add parser ~short_name:'w' ~long_name:"per-word" + ~help:"file to store per-word counts in" option; + (fun () -> match OptParse.Opt.opt option with + Some x -> x + | None -> + OptParse.OptParser.usage parser (); + exit 1 + ) + let total = + let option = OptParse.StdOpt.str_option ~metavar:"file" () in + OptParse.OptParser.add parser ~short_name:'t' ~long_name:"total" + ~help:"file to store total count in" option; + (fun () -> match OptParse.Opt.opt option with + Some x -> x + | None -> + OptParse.OptParser.usage parser (); + exit 1 + ) + let order = + let option = OptParse.StdOpt.int_option ~default:15 () in + OptParse.OptParser.add parser ~short_name:'r' ~long_name:"order" option; + (fun () -> OptParse.Opt.get option) + let files = OptParse.OptParser.parse_argv parser + let total = total () + let per_word = per_word () + let order = order () +end + +let load_file order judy total name = + Misc.io_of_gzip name |> Fasta.enum_words order + |> Enum.fold (fun word total -> + Judy.bump judy word; + Judy.bump judy (Gene.reverse word); + total + 2 + ) total +let load_files order names = + let judy = Judy.create () in + List.fold_left (load_file order judy) 0 names, judy +let gzip_output_string c s = Gzip.output c s 0 (String.length s) +let () = + let total_words, judy = load_files Options.order Options.files in + ( + let c = open_out Options.total in + output_string c (string_of_int total_words ^ "\n"); + close_out c + ); ( + let c = Gzip.open_out Options.per_word in + Judy.iter (fun word count -> + gzip_output_string c + (String.concat "" [string_of_int count; " "; word; "\n"]) + ) judy; + Gzip.close_out c + ) diff --git a/src/nbc/count.mlb b/src/nbc/count.mlb new file mode 100644 index 0000000..e0eb0e6 --- /dev/null +++ b/src/nbc/count.mlb @@ -0,0 +1,10 @@ +local + $(SML_LIB)/basis/basis.mlb + $(SML_LIB)/smlnj-lib/Util/smlnj-lib.mlb + stream.mlb + tree.mlb + nmer.mlb + fasta.mlb +in + count.sml +end diff --git a/src/nbc/count.sml b/src/nbc/count.sml new file mode 100644 index 0000000..d036050 --- /dev/null +++ b/src/nbc/count.sml @@ -0,0 +1,290 @@ +datatype sides = Single | Double +datatype labeled = Labeled | Unlabeled +local + (* + val perWord = ref NONE + val total = ref NONE + *) + val order = ref (SOME 15) + val sides = ref Double + val labeled = ref Labeled + val optionsWithoutHelp = [ + (* { + short = "w", long = ["per-word"] + , desc = GetOpt.ReqArg ( + fn file => perWord := SOME file + , "file" + ), help = "file to store per-word counts in" + }, { + short = "t", long = ["total"] + , desc = GetOpt.ReqArg ( + fn file => total := SOME file + , "file" + ), help = "file to store total count in" + }, *) { + short = "r", long = ["order"] + , desc = GetOpt.ReqArg ( + fn size => order := Int.fromString size + , "size" + ), help = "word size" + }, { + short = "1", long = ["single"] + , desc = GetOpt.NoArg (fn () => sides := Single) + , help = "only count one side" + }, { + short = "u", long = ["unlabeled"] + , desc = GetOpt.NoArg (fn () => labeled := Unlabeled) + , help = "emit counts for every possible nmer, without labels" + } + ] + fun usageString () = GetOpt.usageInfo { + header = CommandLine.name () ^ " ..." + , options = optionsWithoutHelp + } ^ "\n" + datatype status = Success | Failure + fun displayHelpAndExit status = ( + TextIO.output ( + TextIO.stdErr + , usageString () + ); OS.Process.exit (case status of + Success => OS.Process.success + | Failure => OS.Process.failure + ) + ) + val options = { + short = "h", long = ["help"] + , desc = GetOpt.NoArg (fn () => displayHelpAndExit Success) + , help = "display help" + } :: optionsWithoutHelp +in + val (_, files) = GetOpt.getOpt { + argOrder = GetOpt.Permute + , options = options + , errFn = fn errorMessage => ( + TextIO.output (TextIO.stdErr, errorMessage ^ "\n") + ; displayHelpAndExit Failure + ) + } (CommandLine.arguments ()) + (* + val perWordFileName = case !perWord of + NONE => ( + TextIO.output ( + stdErr + , "per-word file name required but not provided\n" + ); displayHelpAndExit Failure + ) | SOME fileName => fileName + val totalFileName = case !total of + NONE => ( + TextIO.output ( + stdErr + , "total file name required but not provided\n" + ); displayHelpAndExit Failure + ) | SOME fileName => fileName + *) + val order = case !order of + NONE => ( + TextIO.output ( + TextIO.stdErr + , "invalid order\n" + ); displayHelpAndExit Failure + ) | SOME integer => integer + val sides = !sides + val labeled = !labeled +end + +signature COLLECTION = sig + type collection + type nmer + val empty: unit -> collection + val add: collection * nmer -> unit + val get: collection * nmer -> int + val app: (nmer * int -> unit) -> collection -> unit +end + +functor Collection (Nmer: NMER) +:> COLLECTION where type nmer = Nmer.nmer = struct + type nmer = Nmer.nmer + structure Table = HashTableFn ( + type hash_key = nmer + val hashVal = Nmer.hash + val sameKey = Nmer.equal + ) + type collection = int ref Table.hash_table + exception NotFound + fun empty () = Table.mkTable (256 * 1024, NotFound) + fun add (table, nmer) = case Table.find table nmer of + NONE => Table.insert table (nmer, ref 1) + | SOME count => count := !count + 1 + fun get (table, nmer) = case Table.find table nmer of + NONE => 0 + | SOME (ref count) => count + fun app execute table = Table.appi (fn (nmer, ref count) => + execute (nmer, count) + ) table +end + +datatype result = Success | Failure + +signature OUTPUT = sig + type collection + val output: collection -> unit +end + +functor Unlabeled ( + structure Nmer: NMER + structure Collection: COLLECTION + sharing type Collection.nmer = Nmer.nmer +) :> OUTPUT + where type collection = Collection.collection += struct + type collection = Collection.collection + fun put string = TextIO.output (TextIO.stdOut, string) + fun single count = ( + put (Int.toString count) + ; put "\n" + ) + fun output collection = + let + fun continue nmer = ( + single (Collection.get (collection, nmer)) + ; + if nmer = Nmer.maximum then () + else continue (Nmer.next nmer) + ) + in + continue (Nmer.minimum) + end +end + +functor Labeled ( + structure Nmer: NMER + structure Collection: COLLECTION + sharing type Collection.nmer = Nmer.nmer +) :> OUTPUT + where type collection = Collection.collection += struct + type collection = Collection.collection + fun put string = TextIO.output (TextIO.stdOut, string) + fun single (nmer, count) = ( + put (Nmer.toString nmer) + ; put " " + ; put (Int.toString count) + ; put "\n" + ) + fun output collection = Collection.app single collection +end + +functor File ( + structure Collection: COLLECTION + structure Output: OUTPUT + sharing type Collection.collection = Output.collection +) :> FILE + where type nmer = Collection.nmer + where type result = result + where type argument = unit += struct + type argument = unit + type file = Collection.collection + type read = unit + type nmer = Collection.nmer + type result = result + fun startFile _ = Collection.empty () + fun startRead _ = () + fun nmer (counts, (), nmer) = Collection.add (counts, nmer) + fun stopRead (_, ()) = () + fun stopFile counts = ( + Output.output counts + ; Success + ) + fun invalidFormat file = Failure +end + +functor Everything (Nmer: NMER) = struct + structure Collection = Collection (Nmer) + structure Unlabeled = File ( + structure Collection = Collection + structure Output = Unlabeled ( + structure Nmer = Nmer + structure Collection = Collection + ) + ) + structure Labeled = File ( + structure Collection = Collection + structure Output = Labeled ( + structure Nmer = Nmer + structure Collection = Collection + ) + ) + structure SingleSidedUnlabeled = SingleSidedFasta ( + structure Nmer = Nmer + structure File = Unlabeled + ) + structure DoubleSidedUnlabeled = DoubleSidedFasta ( + structure Nmer = Nmer + structure File = Unlabeled + ) + structure SingleSidedLabeled = SingleSidedFasta ( + structure Nmer = Nmer + structure File = Labeled + ) + structure DoubleSidedLabeled = DoubleSidedFasta ( + structure Nmer = Nmer + structure File = Labeled + ) +end + +structure Everything32 = Everything ( + Nmer ( + val order = order + structure Word = Word32 + ) +) +structure Everything64 = Everything ( + Nmer ( + val order = order + structure Word = Word64 + ) +) + +val process = + if order <= 32 then (case sides of + Single => (case labeled of + Unlabeled => Everything32.SingleSidedUnlabeled.process + | Labeled => Everything32.SingleSidedLabeled.process + ) | Double => (case labeled of + Unlabeled => Everything32.DoubleSidedUnlabeled.process + | Labeled => Everything32.DoubleSidedLabeled.process + ) + ) else (case sides of + Single => (case labeled of + Unlabeled => Everything64.SingleSidedUnlabeled.process + | Labeled => Everything64.SingleSidedLabeled.process + ) | Double => (case labeled of + Unlabeled => Everything64.DoubleSidedUnlabeled.process + | Labeled => Everything64.DoubleSidedLabeled.process + ) + ) + +val () = + let + fun one name = + let + val instream = TextIO.openIn name + val result = process ((), instream) + in + TextIO.closeIn instream + ; case result of + Success => true + | Failure => ( + TextIO.output ( + TextIO.stdErr + , name + ^ ": invalid format\n" + ); false + ) + end + fun all names = List.all one names + in + if all files then () + else OS.Process.exit OS.Process.failure + end diff --git a/src/nbc/fail.sml b/src/nbc/fail.sml new file mode 100644 index 0000000..f3d324c --- /dev/null +++ b/src/nbc/fail.sml @@ -0,0 +1,10 @@ +signature FAIL = sig + val fail: string -> 'a +end + +structure Fail :> FAIL = struct + fun fail why = ( + TextIO.output (TextIO.stdErr, why ^ "\n") + ; OS.Process.exit OS.Process.failure + ) +end diff --git a/src/nbc/fasta-all.mlb b/src/nbc/fasta-all.mlb new file mode 100644 index 0000000..7bf2790 --- /dev/null +++ b/src/nbc/fasta-all.mlb @@ -0,0 +1,8 @@ +local + $(SML_LIB)/basis/basis.mlb + nmer.mlb + parse-state.mlb + test-library.mlb +in + fasta.sml +end diff --git a/src/nbc/fasta-test.mlb b/src/nbc/fasta-test.mlb new file mode 100644 index 0000000..564fd11 --- /dev/null +++ b/src/nbc/fasta-test.mlb @@ -0,0 +1,9 @@ +local + local + fasta-all.mlb + in + functor Test + end +in + test-generate.sml +end diff --git a/src/nbc/fasta.mlb b/src/nbc/fasta.mlb new file mode 100644 index 0000000..a308ac5 --- /dev/null +++ b/src/nbc/fasta.mlb @@ -0,0 +1,8 @@ +local + fasta-all.mlb +in + signature FILE + signature FASTA + functor SingleSidedFasta + functor DoubleSidedFasta +end diff --git a/src/nbc/fasta.sml b/src/nbc/fasta.sml new file mode 100644 index 0000000..8089025 --- /dev/null +++ b/src/nbc/fasta.sml @@ -0,0 +1,326 @@ +signature FILE = sig + type argument + type file + type read + type nmer + type result + val startFile: argument -> file + val startRead: file * string -> read + val nmer: file * read * nmer -> unit + val stopRead: file * read -> unit + val stopFile: file -> result + val invalidFormat: file -> result +end + +signature FASTA = sig + type argument + type result + val process: argument * TextIO.instream -> result +end + +functor AgnosticFasta ( + structure Nmer: NMER + structure File: FILE + sharing type Nmer.nmer = File.nmer + structure Sides: sig + include NMER_SIDES + type file + type read + val process: file * read * sides -> unit + end + sharing type Nmer.base = Sides.sidesBase + sharing type Nmer.nmer = Sides.sidesNmer + sharing type File.read = Sides.read + sharing type File.file = Sides.file +) :> FASTA + where type argument = File.argument + where type result = File.result += struct + type argument = File.argument + type result = File.result + + val beforeHeaderBeginningOfLine = ParseState.create () + val beforeHeaderMiddleOfLine = ParseState.create () + val afterHeaderBeginningOfLine = ParseState.create () + val afterHeaderMiddleOfLine = ParseState.create () + + fun inputLineButDiscardNewline instream = + Option.map (fn line => + String.extract (line, 0, SOME (size line - 1)) + ) (TextIO.inputLine instream) + datatype z = datatype ParseState.whichCharacters (* This | Any *) + + local + fun header (instream, (file, sides)) = + case inputLineButDiscardNewline instream of + NONE => File.invalidFormat file + | SOME header => ParseState.enter ( + afterHeaderBeginningOfLine + , instream + , ( + file + , File.startRead ( + file + , header + ), sides + ) + ) + fun space (instream, (file, sides)) = ParseState.enter ( + beforeHeaderMiddleOfLine + , instream + , (file, sides) + ) + fun newline (instream, (file, sides)) = ParseState.enter ( + beforeHeaderBeginningOfLine + , instream + , (file, sides) + ) + fun invalidFormat (_, (file, _)) = File.invalidFormat file + in + val () = ParseState.build { + state = beforeHeaderBeginningOfLine + , characters = [ + (These [#">"], header) + , (These [#"\n"], newline) + , (These [#" ", #"\t", #"\r"], space) + , (Any, invalidFormat) + ], endOfFile = invalidFormat + } + val () = ParseState.build { + state = beforeHeaderMiddleOfLine + , characters = [ + (These [#"\n"], newline) + , (These [#" ", #"\t", #"\r"], space) + , (Any, invalidFormat) + ], endOfFile = invalidFormat + } + end + local + fun base base (instream, (file, read, sides)) = ( + Sides.put (sides, base) + ; + if Sides.isFull sides then + Sides.process (file, read, sides) + else () + ; ParseState.enter ( + afterHeaderMiddleOfLine + , instream + , (file, read, sides) + ) + ) + fun space (instream, (file, read, sides)) = ( + ParseState.enter ( + afterHeaderMiddleOfLine + , instream + , (file, read, sides) + ) + ) + fun other (instream, (file, read, sides)) = ( + Sides.clear sides + ; ParseState.enter ( + afterHeaderMiddleOfLine + , instream + , (file, read, sides) + ) + ) + fun newline (instream, (file, read, sides)) = + ParseState.enter ( + afterHeaderBeginningOfLine + , instream + , (file, read, sides) + ) + fun header (instream, (file, read, sides)) = ( + File.stopRead (file, read) + ; Sides.clear sides + ; case inputLineButDiscardNewline instream of + NONE => File.invalidFormat file + | SOME header => ParseState.enter ( + afterHeaderBeginningOfLine + , instream + , ( + file + , File.startRead ( + file + , header + ), sides + ) + ) + ) + fun success (_, (file, read, _)) = ( + File.stopRead (file, read) + ; File.stopFile file + ) + in + val () = ParseState.build { + state = afterHeaderBeginningOfLine + , characters = [ + (These [#"A", #"a"], base Nmer.a) + , (These [#"C", #"c"], base Nmer.c) + , (These [#"G", #"g"], base Nmer.g) + , (These [#"T", #"t"], base Nmer.t) + , (These [#">"], header) + , (These [#"\n"], newline) + , (These [#" ", #"\t", #"\r"], space) + , (Any, other) + ], endOfFile = success + } + val () = ParseState.build { + state = afterHeaderMiddleOfLine + , characters = [ + (These [#"A", #"a"], base Nmer.a) + , (These [#"C", #"c"], base Nmer.c) + , (These [#"G", #"g"], base Nmer.g) + , (These [#"T", #"t"], base Nmer.t) + , (These [#" ", #"\t", #"\r"], space) + , (These [#"\n"], newline) + , (Any, other) + ], endOfFile = success + } + end + fun process (argument, instream) = ParseState.enter ( + beforeHeaderBeginningOfLine + , instream + , (File.startFile argument, Sides.create ()) + ) +end + +functor SingleSidedFasta ( + structure Nmer: NMER + structure File: FILE + sharing type Nmer.nmer = File.nmer +) = AgnosticFasta ( + structure Nmer = Nmer + structure File = File + structure Sides = struct + type read = File.read + type file = File.file + open Nmer.Single + fun process (file, read, sides) = + File.nmer (file, read, forward sides) + end +) + +functor DoubleSidedFasta ( + structure Nmer: NMER + structure File: FILE + sharing type Nmer.nmer = File.nmer +) = AgnosticFasta ( + structure Nmer = Nmer + structure File = File + structure Sides = struct + type read = File.read + type file = File.file + open Nmer.Double + fun process (file, read, sides) = ( + File.nmer (file, read, forward sides) + ; File.nmer (file, read, reverse sides) + ) + end +) + +functor TestFile (Nmer: NMER) = struct + type argument = unit + type nmer = Nmer.nmer + type read = {header: string, nmers: nmer list ref} + type file = {header: string, nmers: string list} list ref + type result = string + fun startFile () = ref nil + fun stopFile file = String.concatWith ";" ( + map (fn {header, nmers} => + header + ^ ":" + ^ String.concatWith "," (rev nmers) + ) (rev (!file)) + ) + fun startRead (_, header) = + {header = header, nmers = ref nil} + fun nmer (_, {header = _, nmers}, nmer) = + nmers := nmer :: !nmers + fun stopRead (file, {header, nmers = ref nmers}) = + file := { + header = header + , nmers = map Nmer.toString nmers + } :: !file + fun invalidFormat _ = "invalid format" +end + +functor Test () = struct + structure Nmer1 = Nmer ( + val order = 1 + structure Word = Word32 + ) + structure File1 = TestFile (Nmer1) + structure SingleFasta1 = SingleSidedFasta ( + structure Nmer = Nmer1 + structure File = File1 + ) + fun test process input () = process ((), TextIO.openString input) + val single1 = test SingleFasta1.process + structure Nmer2 = Nmer ( + val order = 2 + structure Word = Word32 + ) + structure File2 = TestFile (Nmer2) + structure SingleFasta2 = SingleSidedFasta ( + structure Nmer = Nmer2 + structure File = File2 + ) + val single2 = test SingleFasta2.process + structure DoubleFasta1 = DoubleSidedFasta ( + structure Nmer = Nmer1 + structure File = File1 + ) + val double1 = test DoubleFasta1.process + structure DoubleFasta2 = DoubleSidedFasta ( + structure Nmer = Nmer2 + structure File = File2 + ) + val double2 = test DoubleFasta2.process + val () = Test.list [ + { + description = "single 1: A" + , function = single1 ">foo\nA\n" + , expectedResult = "foo:A" + }, { + description = "single 1: AG" + , function = single1 ">foo\nAG\n" + , expectedResult = "foo:A,G" + }, { + description = "single 2: A" + , function = single2 ">foo\nA\n" + , expectedResult = "foo:" + }, { + description = "single 2: CTGAG" + , function = single2 ">foo\nCTGAG\n" + , expectedResult = "foo:CT,TG,GA,AG" + }, { + description = "double 1: C" + , function = double1 ">bar\nC\n" + , expectedResult = "bar:C,G" + }, { + description = "double 2: T" + , function = double2 ">baz\nT\n" + , expectedResult = "baz:" + }, { + description = "double 2: GC" + , function = double2 ">quux\nGC\n" + , expectedResult = "quux:GC,GC" + }, { + description = "double 2: CCC\\nC\\nCT" + , function = double2 ">goo\nCCC\nC\nCT\n" + , expectedResult = "goo:CC,GG,CC,GG,CC,GG,CC,GG,CT,AG" + }, { + description = "double 2: CC\\nC*\\nT" + , function = double2 ">goo\nCC\nC*\nT\n" + , expectedResult = "goo:CC,GG,CC,GG" + }, { + description = "double 2: foo CATGAC goo TACCAG" + , function = double2 + ">foo\nCATGAC\n>goo\nTACCAG\n" + , expectedResult = ( + "foo:CA,TG,AT,AT,TG,CA,GA,TC,AC,GT" + ^ ";goo:TA,TA,AC,GT,CC,GG,CA,TG,AG,CT" + ) + } + ] +end diff --git a/src/nbc/gene.mlb b/src/nbc/gene.mlb new file mode 100644 index 0000000..47261ff --- /dev/null +++ b/src/nbc/gene.mlb @@ -0,0 +1,5 @@ +local + $(SML_LIB)/basis/basis.mlb +in + gene.sml +end diff --git a/src/nbc/gene.sml b/src/nbc/gene.sml new file mode 100644 index 0000000..738e3fe --- /dev/null +++ b/src/nbc/gene.sml @@ -0,0 +1,49 @@ +signature GENE = sig + val reverse: string -> string + val first: int -> string + val next: string -> string option +end + +structure Gene :> GENE = struct + fun reverse s = + let + val n = size s + val m = n - 1 + fun opposite c = case c of + #"A" => #"T" + | #"T" => #"A" + | #"C" => #"G" + | #"G" => #"C" + | _ => c + in + CharVector.tabulate (n, fn i => opposite (String.sub (s, m - i))) + end + fun first order = CharVector.tabulate (order, fn _ => #"A") + fun next nmer = + let + val order = size nmer + fun finish (rightmostNonT, replacement) = CharVector.tabulate ( + order + , fn index => + case + Int.compare ( + index + , rightmostNonT + ) + of + LESS => String.sub (nmer, index) + | EQUAL => replacement + | GREATER => #"A" + ) + fun continue index = + if index < 0 then NONE + else case String.sub (nmer, index) of + #"A" => SOME (finish (index, #"C")) + | #"C" => SOME (finish (index, #"G")) + | #"G" => SOME (finish (index, #"T")) + | #"T" => continue (index - 1) + | _ => raise Fail "Invalid base" + in + continue (size nmer - 1) + end +end diff --git a/src/nbc/genome.sml b/src/nbc/genome.sml new file mode 100644 index 0000000..81f3183 --- /dev/null +++ b/src/nbc/genome.sml @@ -0,0 +1,33 @@ +signature GENOME = sig + exception Bad + type t + val load: string * int -> t + val get: t * string -> int option +end + +structure Genome :> GENOME = struct + exception Bad + fun |> (x, f) = f x + infix |> + + type t = (string, int) HashTable.hash_table + fun load (gname, order) = + let + val h = HashTable.mkTable + (HashString.hashString, op =) + (1024 * 1024, Fail "") + in + Options.genomeText (order, gname) |> Gzip.openIn |> Misc.sequenceLines + |> Sequence.map (fn s => ( + case Misc.split2 s of + SOME (count, nmer) => ( + nmer + , case Int.fromString count of + NONE => raise Bad + | SOME x => x + ) | NONE => raise Bad + )) |> Sequence.app (HashTable.insert h) + ; h + end + fun get (h, nmer) = HashTable.find h nmer +end diff --git a/src/nbc/gzip.c b/src/nbc/gzip.c new file mode 100644 index 0000000..d2bf98f --- /dev/null +++ b/src/nbc/gzip.c @@ -0,0 +1,13 @@ +#include + +int +gzreadoffset(gzFile file, voidp buf, unsigned offset, unsigned len) +{ + return gzread(file, buf + offset, len); +} + +int +gzwriteoffset(gzFile file, voidpc buf, unsigned offset, unsigned len) +{ + return gzwrite(file, buf + offset, len); +} diff --git a/src/nbc/gzip.sml b/src/nbc/gzip.sml new file mode 100644 index 0000000..fc6e5ff --- /dev/null +++ b/src/nbc/gzip.sml @@ -0,0 +1,164 @@ +signature GZIP = sig + exception Failure + val openIn: string -> TextIO.instream + val openOut: string -> TextIO.outstream +end + +structure Gzip :> GZIP = struct + structure Primitive :> sig + eqtype gzFile + val null: gzFile + val gzopen: string * string -> gzFile + val gzread: gzFile * CharArray.array * word * word -> int + val gzwritea: gzFile * CharArray.array * word * word -> int + val gzwritev: gzFile * CharVector.vector * word * word -> int + val gzclose: gzFile -> int + end = struct + type gzFile = MLton.Pointer.t + val null = MLton.Pointer.null + val gzopen = _import "gzopen": string * string -> gzFile; + val gzread = _import "gzreadoffset": + gzFile * CharArray.array * word * word -> int; + val gzwritea = _import "gzwriteoffset": + gzFile * CharArray.array * word * word -> int; + val gzwritev = _import "gzwriteoffset": + gzFile * CharVector.vector * word * word -> int; + val gzclose = _import "gzclose": gzFile -> int; + end + + exception Failure + fun readArraySlice (g, slice) = + let + val (array, offset, length) = CharArraySlice.base slice + val wordoffset = Word.fromInt offset + val wordlength = Word.fromInt length + in + Primitive.gzread (g, array, wordoffset, wordlength) + end + fun readerFromPrimitive (name, g) = + let + val closed = ref false + fun error x = raise IO.Io { + name = name + , function = "readArr" + , cause = x + } + fun readArr slice = + if !closed then error IO.ClosedStream + else let + val r = readArraySlice (g, slice) + in + if r < 0 then error Failure + else r + end + fun close () = + if !closed then () + else ( + closed := true + ; ignore (Primitive.gzclose g) + ) + in + TextPrimIO.augmentReader (TextPrimIO.RD { + name = name + , chunkSize = 32 * 1024 + , readVec = NONE + , readArr = SOME readArr + , readVecNB = NONE + , readArrNB = NONE + , block = NONE + , canInput = NONE + , avail = fn () => NONE + , getPos = NONE + , setPos = NONE + , endPos = NONE + , verifyPos = NONE + , close = close + , ioDesc = NONE + }) + end + fun readerFromName name = + let + val path = name ^ "\000" + val g = Primitive.gzopen (path, "r\000") + in + if g = Primitive.null then NONE + else SOME (readerFromPrimitive (name, g)) + end + fun openIn name = + case readerFromName name of + SOME x => TextIO.mkInstream (TextIO.StreamIO.mkInstream (x, "")) + | NONE => raise IO.Io { + name = name + , function = "openIn" + , cause = Failure + } + fun writeSlice (base, write) (g, slice) = + let + val (vector, offset, length) = base slice + val wordoffset = Word.fromInt offset + val wordlength = Word.fromInt length + in + write (g, vector, wordoffset, wordlength) + end + val writeVectorSlice = writeSlice (CharVectorSlice.base, Primitive.gzwritev) + val writeArraySlice = writeSlice (CharArraySlice.base, Primitive.gzwritea) + fun writerFromPrimitive (name, g) = + let + val closed = ref false + fun error (function, x) = raise IO.Io { + name = name + , function = function + , cause = x + } + fun close () = + if !closed then () + else ( + closed := true + ; ignore (Primitive.gzclose g) + ) + fun write (name, realWrite) slice = + if !closed then error (name, IO.ClosedStream) + else let + val r = realWrite (g, slice) + in + if r <= 0 then error (name, Failure) + else r + end + val writeVec = write ("writeVec", writeVectorSlice) + val writeArr = write ("writeArr", writeArraySlice) + in + TextPrimIO.augmentWriter (TextPrimIO.WR { + name = name + , chunkSize = 32 * 1024 + , writeVec = SOME writeVec + , writeArr = SOME writeArr + , writeVecNB = NONE + , writeArrNB = NONE + , block = NONE + , canOutput = NONE + , getPos = NONE + , setPos = NONE + , endPos = NONE + , verifyPos = NONE + , close = close + , ioDesc = NONE + }) + end + fun writerFromName name = + let + val path = name ^ "\000" + val g = Primitive.gzopen (path, "w9") + in + if g = Primitive.null then NONE + else SOME (writerFromPrimitive (name, g)) + end + fun openOut name = + case writerFromName name of + SOME x => TextIO.mkOutstream (TextIO.StreamIO.mkOutstream (x, IO.NO_BUF)) + | NONE => raise IO.Io { + name = name + , function = "openOut" + , cause = Failure + } +end + diff --git a/src/nbc/history.mlb b/src/nbc/history.mlb new file mode 100644 index 0000000..e7103cc --- /dev/null +++ b/src/nbc/history.mlb @@ -0,0 +1,5 @@ +local + $(SML_LIB)/basis/basis.mlb +in + history.sml +end diff --git a/src/nbc/history.sml b/src/nbc/history.sml new file mode 100644 index 0000000..52c1bb2 --- /dev/null +++ b/src/nbc/history.sml @@ -0,0 +1,74 @@ +structure History :> sig + type history + val create: int -> history + val clear: history -> history + val push: history * char -> string option * history +end = struct + type history = {maximumSize: int, content: string} + fun create maximumSize = {maximumSize = maximumSize, content = ""} + fun clear {maximumSize, content = _} = + {maximumSize = maximumSize, content = ""} + fun addToString (string, newCharacter) = + let + val size = size string + in + CharVector.tabulate ( + size + 1 + , fn index => + if index = size then newCharacter + else String.sub (string, index) + ) + end + fun shiftIntoString (string, newCharacter) = + let + val size = size string + in + CharVector.tabulate ( + size + , fn index => + if index = size - 1 then newCharacter + else String.sub (string, index + 1) + ) + end + fun push ({maximumSize, content}, newCharacter) = + let + val currentSize = size content + in + if currentSize = maximumSize then + let + val newContent = shiftIntoString ( + content + , newCharacter + ) + in ( + SOME newContent + , { + maximumSize = maximumSize + , content = newContent + } + ) end + else if currentSize = maximumSize - 1 then + let + val newContent = addToString ( + content + , newCharacter + ) + in ( + SOME newContent + , { + maximumSize = maximumSize + , content = newContent + } + ) end + else ( + NONE + , { + maximumSize = maximumSize + , content = addToString ( + content + , newCharacter + ) + } + ) + end +end diff --git a/src/nbc/judy.sml b/src/nbc/judy.sml new file mode 100644 index 0000000..73e257b --- /dev/null +++ b/src/nbc/judy.sml @@ -0,0 +1,175 @@ +signature JUDY = sig + exception OutOfMemory + type t + val create: unit -> t + val insert: t * string * int -> unit + val get: t * string -> int option + val bump: t * string -> unit + val first: t -> (string * int) option + val next: t * string -> (string * int) option + val sequence: t -> (string * int) Sequence.t + val app: (string * int -> unit) -> t -> unit +end +structure Judy :> JUDY = struct + structure Primitive :> sig + type judy + type errorDetail + type return + val judyNull: judy + val errorDetailNull: errorDetail + val get: judy * string * errorDetail -> return + val insert: judy ref * string * errorDetail -> return + val delete: judy ref * string * errorDetail -> int + val free: judy ref * errorDetail -> word + val first: judy * CharArray.array * errorDetail -> return + val next: judy * CharArray.array * errorDetail -> return + val returnIsError: return -> bool + val returnIsNull: return -> bool + val returnGet: return -> int + val returnSet: return * int -> unit + end = struct + type judy = MLton.Pointer.t + type errorDetail = MLton.Pointer.t + type return = MLton.Pointer.t + val judyNull = MLton.Pointer.null + val errorDetailNull = MLton.Pointer.null + val get = _import "JudySLGet": judy * string * errorDetail -> return; + val insert = _import "JudySLIns": judy ref * string * errorDetail -> return; + val delete = _import "JudySLDel": judy ref * string * errorDetail -> int; + val free = _import "JudySLFreeArray": judy ref * errorDetail -> word; + val first = _import "JudySLFirst": judy * CharArray.array * errorDetail -> return; + val next = _import "JudySLNext": judy * CharArray.array * errorDetail -> return; + local + val pjerr = MLton.Pointer.sub (MLton.Pointer.null, 0w1) + in + fun returnIsError return = return = pjerr + end + fun returnIsNull return = return = MLton.Pointer.null + fun returnGet return = Int32.toInt (MLton.Pointer.getInt32 (return, 0)) + fun returnSet (return, i) = MLton.Pointer.setInt32 (return, 0, Int32.fromInt i) + end + exception OutOfMemory + type t = {judy: Primitive.judy ref, max: int ref} + fun create () = {judy = ref Primitive.judyNull, max = ref 0} + fun insert ({judy, max}, key, value) = + let + val return = Primitive.insert ( + judy + , key ^ "\000" + , Primitive.errorDetailNull + ) + in + if Primitive.returnIsError return then raise OutOfMemory + else let + val n = size key + in + if !max < n then max := n else () + ; Primitive.returnSet (return, value) + end + end + fun get ({judy, max = _}, key) = + let + val return = Primitive.get ( + !judy + , key ^ "\000" + , Primitive.errorDetailNull + ) + in + if Primitive.returnIsNull return then NONE + else SOME (Primitive.returnGet return) + end + fun bump ({judy, max}, key) = + let + val return = Primitive.insert ( + judy + , key ^ "\000" + , Primitive.errorDetailNull + ) + in + if Primitive.returnIsError return then raise OutOfMemory + else let + val n = size key + in + if !max < n then max := n else () + ; Primitive.returnSet ( + return + , Primitive.returnGet return + 1 + ) + end + end + fun strlen array = + case CharArray.findi (fn (_, c) => c = #"\000") array of + NONE => raise Option + | SOME (i, _) => i + fun stringFromNullTerminatedArray array = + CharArraySlice.vector ( + CharArraySlice.slice (array, 0, SOME (strlen array)) + ) + fun first {judy, max} = + let + val array = CharArray.array (!max + 1, #"\000") + val return = Primitive.first ( + !judy + , array + , Primitive.errorDetailNull + ) + in + if Primitive.returnIsNull return then NONE + else SOME ( + stringFromNullTerminatedArray array + , Primitive.returnGet return + ) + end + fun next ({judy, max}, key) = + let + val size = size key + val array = CharArray.tabulate ( + !max + 1 + , fn i => + if i < size then String.sub (key, i) + else #"\000" + ) + val return = Primitive.next ( + !judy + , array + , Primitive.errorDetailNull + ) + in + if Primitive.returnIsNull return then NONE + else SOME ( + stringFromNullTerminatedArray array + , Primitive.returnGet return + ) + end + fun sequence t = + let + val last = ref NONE + fun get () = + case ( + case !last of + NONE => first t + | SOME key => next (t, key) + ) of + NONE => NONE + | SOME (return as (key, _)) => ( + last := SOME key + ; SOME return + ) + in + Sequence.from get + end + fun app f t = + let + fun apply (key, value) = ( + f (key, value) + ; fetch key + ) and fetch key = + case next (t, key) of + NONE => () + | SOME x => apply x + in + case first t of + NONE => () + | SOME x => apply x + end +end diff --git a/src/nbc/kahan.sml b/src/nbc/kahan.sml new file mode 100644 index 0000000..70c6b47 --- /dev/null +++ b/src/nbc/kahan.sml @@ -0,0 +1,31 @@ +(* Kahan summation *) + +signature KAHAN = sig + type t + val zero: t + val add: t * real -> t + val sum: t -> real + val sequence: real Sequence.t -> real + val list: real list -> real + val array: real array -> real +end + +structure Kahan :> KAHAN = struct + type t = real * real + val zero = (0.0, 0.0) + fun add ((s, c), x) = + let + val y = x - c + val t = s + y + in + (t, t - s - y) + end + fun sum (s, c) = s + local + fun swappedAdd (a, b) = add (b, a) + in + fun sequence e = sum (Sequence.fold swappedAdd zero e) + fun list l = sum (foldl swappedAdd zero l) + fun array a = sum (Array.foldl swappedAdd zero a) + end +end diff --git a/src/nbc/main.sml b/src/nbc/main.sml new file mode 100644 index 0000000..361d9ca --- /dev/null +++ b/src/nbc/main.sml @@ -0,0 +1,98 @@ +structure Main = struct + +fun |> (x, f) = f x +infix |> + +val order = Options.order +val genomes = case Options.genomes of + SOME x => x |> TextIO.openIn |> Misc.sequenceLines + | NONE => Options.genomesDir |> Misc.sortedDirectoryNoPrefix |> Sequence.fromList +fun input () = + case Fasta.sequence (Gzip.openIn Options.input) of + NONE => Fail.fail "input file is not FASTA format" + | SOME x => + Sequence.map (fn (header, data) => + (header, String.map Char.toUpper data) + ) x +fun output genome = + let + fun inner format = case format of + Options.Matlab {variable, file} => + let + val matlab = Matlab.openOut file + val doubleArray = Matlab.beginDoubleArray (matlab, variable) + in { + write = fn (_, score) => + Matlab.writeDouble (doubleArray, score) + , close = fn () => ( + Matlab.concludeDoubleArray doubleArray + ; Matlab.closeOut matlab + ) + } end + | Options.Text filename => + let + val gzip = Gzip.openOut filename + in { + write = fn (header, score) => + TextIO.output ( + gzip + , Real.fmt (StringCvt.FIX (SOME 8)) score + ) + , close = fn () => TextIO.closeOut gzip + } end + | Options.Dual {text, matlab} => + let + val text = inner (Options.Text text) + val matlab = inner (Options.Matlab matlab) + in { + write = fn x => ( + #write text x + ; #write matlab x + ), close = fn () => ( + #close text () + ; #close matlab () + ) + } end + in + inner (Options.output genome) + end +fun totalWords (genome, order) = + let + val name = Options.totalWords (genome, order) + val input = TextIO.openIn name + in + (case Option.mapPartial Real.fromString (Misc.inputLine input) of + NONE => Fail.fail ("could not read number from " ^ name) + | SOME r => r + ) before TextIO.closeIn input + end + +val () = + Sequence.app (fn gname => + let + val {write, close} = output gname + val totalWords = totalWords (gname, order) + val genome = ( + Stopwatch.start ("Loading genome " ^ gname) + ; Genome.load (gname, order) + before Stopwatch.finish () + ) + in + Stopwatch.start ("Scoring fragments") + ; input () |> Sequence.app (fn (header, fragment) => + write ( + header + , Score.score ( + order + , Options.missConstant + , fn nmer => Genome.get (genome, nmer) + , totalWords + , fragment + ) + ) + ); Stopwatch.finish () + ; close () + end + ) genomes + +end diff --git a/src/nbc/matlab.sml b/src/nbc/matlab.sml new file mode 100644 index 0000000..65f532d --- /dev/null +++ b/src/nbc/matlab.sml @@ -0,0 +1,135 @@ +signature MATLAB = sig + type t + val openOut: string -> t + type doubleArray + val beginDoubleArray: t * string -> doubleArray + val writeDouble: doubleArray * real -> unit + val concludeDoubleArray: doubleArray -> unit + val writeDoubleArray: t * string * real vector -> unit + val closeOut: t -> unit +end + +structure Matlab :> MATLAB = struct + fun outputReal (c, f) = BinIO.output (c, Binary.fromReal f) + fun outputInt32 (c, i) = BinIO.output (c, Binary.fromInt32 i) + fun outputInt16 (c, i) = BinIO.output (c, Binary.fromInt16 i) + type t = BinIO.outstream + type doubleArray = { + outstream: BinIO.outstream + , wholeSizePos: BinIO.StreamIO.out_pos + , paddedSize: int + , arraySizePos: BinIO.StreamIO.out_pos + , dataSizePos: BinIO.StreamIO.out_pos + , elements: int ref + } + fun pad n = + if n > 0 andalso n <= 4 then 4 + else (n + 7) div 8 * 8 + fun doubleArraySize (nameLength, arrayLength) = + 32 + nameLength + 4 + 8 + arrayLength * 8 + val s8Tag = 1 + val s32Tag = 5 + val u32Tag = 6 + val doubleTag = 9 + val doubleFlag = 6 + val matrixTag = 14 + fun lsl (x, y) = Word.toIntX (Word.<< (Word.fromInt x, Word.fromInt y)) + fun beginDoubleArray (c, name) = + let + val nameSize = Int.min (size name, 63) + val name = Word8VectorSlice.vector ( + Word8VectorSlice.slice (Byte.stringToBytes name, 0, SOME nameSize) + ) + val paddedSize = pad nameSize + val padding = Word8Vector.tabulate (paddedSize - nameSize, fn _ => 0w0) + val () = outputInt32 (c, matrixTag) + val wholeSizePos = BinIO.getPosOut c + val () = ( + outputInt32 (c, 0) + ; outputInt32 (c, u32Tag) + ; outputInt32 (c, 8) + ; outputInt32 (c, doubleFlag) + ; outputInt32 (c, 0) + ; outputInt32 (c, s32Tag) + ; outputInt32 (c, 8) + ; outputInt32 (c, 1) + ) + val arraySizePos = BinIO.getPosOut c + val () = ( + outputInt32 (c, 0) + ; if nameSize > 0 andalso nameSize <= 4 then + outputInt32 (c, lsl (nameSize, 16) + s8Tag) + else ( + outputInt32 (c, s8Tag) + ; outputInt32 (c, nameSize) + ); BinIO.output (c, name) + ; BinIO.output (c, padding) + ; outputInt32 (c, doubleTag) + ) + val dataSizePos = BinIO.getPosOut c + val () = outputInt32 (c, 0) + in + { + outstream = c + , wholeSizePos = wholeSizePos + , paddedSize = paddedSize + , arraySizePos = arraySizePos + , dataSizePos = dataSizePos + , elements = ref 0 + } + end + fun writeDouble (da, f) = ( + outputReal (#outstream da, f) + ; #elements da := !(#elements da) + 1 + ) + fun concludeDoubleArray da = + let + val saved = BinIO.getPosOut (#outstream da) + in + BinIO.setPosOut (#outstream da, #wholeSizePos da) + ; outputInt32 ( + #outstream da + , doubleArraySize (#paddedSize da, !(#elements da)) + ); BinIO.setPosOut (#outstream da, #arraySizePos da) + ; outputInt32 (#outstream da, !(#elements da)) + ; BinIO.setPosOut (#outstream da, #dataSizePos da) + ; outputInt32 (#outstream da, (!(#elements da) * 8)) + ; BinIO.setPosOut (#outstream da, saved) + end + fun writeDoubleArray (c, name, array) = + let + val da = beginDoubleArray (c, name) + in + Vector.app (fn x => writeDouble (da, x)) array + ; concludeDoubleArray da + end + fun writeHeader (c, software, version) = + let + val date = Date.fromTimeUniv (Time.now ()) + val text = concat [ + "MATLAB 5.0 MAT-file, written by " + , software, " ", version, ", " + , Date.fmt "%Y-%m-$d %H:%M:%S UTC" date + ] + val size = size text + val header = + CharVector.tabulate ( + 124 + , fn i => + if i < size then String.sub (text, i) + else #" " + ) + in + BinIO.output (c, Byte.stringToBytes header) + ; outputInt16 (c, 0x0100) + ; outputInt16 (c, 0x4d49) + end + fun openOut name = + let + val c = BinIO.openOut name + in + writeHeader (c, Program.name, Program.version) + ; c + end + val closeOut = BinIO.closeOut +end diff --git a/src/nbc/misc.sml b/src/nbc/misc.sml new file mode 100644 index 0000000..4790bbc --- /dev/null +++ b/src/nbc/misc.sml @@ -0,0 +1,125 @@ +signature MISC = sig + val inputLine: TextIO.instream -> string option + val sequenceFromRead: (TextIO.instream -> 'a option) -> TextIO.instream -> 'a Sequence.t + val sequenceLines: TextIO.instream -> string Sequence.t + val sortedDirectoryNoPrefix: string -> string list + val sortedDirectory: string -> string list + val substitute: (string * string) list -> string -> string option + val basenameWithoutExtension: string -> string + val split2: string -> (string * string) option + val longestCommonSubstring: string list -> string +end + +structure Misc :> MISC = struct + fun |> (x, f) = f x + infix |> + fun \ f x y = f (x, y) + fun sequenceFromRead f ioc = Sequence.from (fn () => f ioc) + fun inputLine instream = case TextIO.inputLine instream of + NONE => NONE + | SOME x => SOME (String.substring (x, 0, size x - 1)) + val sequenceLines = sequenceFromRead inputLine + fun sortedDirectoryNoPrefix d = + let + val h = OS.FileSys.openDir d + fun loop l = + case OS.FileSys.readDir h of + NONE => ( + OS.FileSys.closeDir h + ; ListMergeSort.sort (op >) l + ) | SOME n => loop ( + if String.sub (n, 0) = #"." then l + else n :: l + ) + in + loop nil + end + fun sortedDirectory d = + map (\OS.Path.concat d) (sortedDirectoryNoPrefix d) + fun assoc list key = + case + List.find (fn (possibility, _) => possibility = key) list + of + NONE => NONE + | SOME (_, answer) => SOME answer + local + exception NotFound + in + fun substitute v s = + Substitution.substitute + (fn s => (case assoc v s of + NONE => raise NotFound + | SOME x => x + )) s + handle NotFound => NONE + end + fun basenameWithoutExtension s = OS.Path.base (OS.Path.file s) + local + fun index f (string, offset) = + let + val last = size string + fun loop i = + if i = last then ~1 + else if f (String.sub (string, i)) then i + else loop (i + 1) + in + loop offset + end + in + val indexWhitespace = index Char.isSpace + val indexNonWhitespace = index (not o Char.isSpace) + end + fun split2 s = + let + val f1b = indexNonWhitespace (s, 0) + in + if f1b = ~1 then NONE + else let + val f1e = indexWhitespace (s, f1b + 1) + in + if f1e = ~1 then NONE + else let + val f2b = indexNonWhitespace (s, f1e + 1) + in + if f2b = ~1 then NONE + else let + val f2e = indexWhitespace (s, f2b + 1) + in + if f2e = ~1 then SOME ( + substring (s, f1b, f1e - f1b) + , substring (s, f2b, size s - f2b) + ) else if indexNonWhitespace (s, f2e + 1) = ~1 then + SOME ( + substring (s, f1b, f1e - f1b) + , substring (s, f2b, f2e - f2b) + ) + else NONE + end + end + end + end + fun longestCommonSubstring strings = + case + foldl (fn (a, b) => (case b of + NONE => SOME a + | SOME b => if size a < size b then SOME a else SOME b + )) NONE strings + of + NONE => "" + | SOME shortest => let + val minimumSize = size shortest + fun loop (size, offset) = + let + fun next () = + if size + offset = minimumSize then + loop (size - 1, 0) + else loop (size, offset + 1) + val sub = substring (shortest, offset, size) + in + if List.all (String.isSubstring sub) strings then sub + else next () + end + in + loop (minimumSize, 0) + end +end diff --git a/src/nbc/nmer-all.mlb b/src/nbc/nmer-all.mlb new file mode 100644 index 0000000..dcc3126 --- /dev/null +++ b/src/nbc/nmer-all.mlb @@ -0,0 +1,6 @@ +local + $(SML_LIB)/basis/basis.mlb + test-library.mlb +in + nmer.sml +end diff --git a/src/nbc/nmer-test.mlb b/src/nbc/nmer-test.mlb new file mode 100644 index 0000000..c363f4a --- /dev/null +++ b/src/nbc/nmer-test.mlb @@ -0,0 +1,9 @@ +local + local + nmer-all.mlb + in + functor Test + end +in + test-generate.sml +end diff --git a/src/nbc/nmer.mlb b/src/nbc/nmer.mlb new file mode 100644 index 0000000..a7f4b01 --- /dev/null +++ b/src/nbc/nmer.mlb @@ -0,0 +1,7 @@ +local + nmer-all.mlb +in + signature NMER + signature NMER_SIDES + functor Nmer +end diff --git a/src/nbc/nmer.sml b/src/nbc/nmer.sml new file mode 100644 index 0000000..82c5b53 --- /dev/null +++ b/src/nbc/nmer.sml @@ -0,0 +1,557 @@ +signature NMER_SIDES = sig + type sidesBase + type sidesNmer + type sides + val create: unit -> sides + val clear: sides -> unit + val put: sides * sidesBase -> unit + val isFull: sides -> bool + val forward: sides -> sidesNmer +end + +signature NMER = sig + eqtype base + val a: base + val c: base + val g: base + val t: base + eqtype nmer + val compare: nmer * nmer -> order + val maximum: nmer + val minimum: nmer + val next: nmer -> nmer + val hash: nmer -> Word.word + val equal: nmer * nmer -> bool + val toString: nmer -> string + val fromString: string -> nmer option + structure Single: NMER_SIDES + where type sidesBase = base + where type sidesNmer = nmer + structure Double: sig + include NMER_SIDES + where type sidesBase = base + where type sidesNmer = nmer + val reverse: sides -> nmer + end +end + +signature NMER_ARGUMENTS = sig + val order: int + structure Word: sig + eqtype word + val fromInt: Int.int -> word + val toInt: word -> Int.int + val + : word * word -> word + val << : word * Word.word -> word + val ~>> : word * Word.word -> word + val andb: word * word -> word + val orb: word * word -> word + val xorb: word * word -> word + val compare: word * word -> order + val toLarge: word -> LargeWord.word + end +end + +functor Nmer (Arguments: NMER_ARGUMENTS) = struct + type base = Arguments.Word.word + val a = Arguments.Word.fromInt 0 + val c = Arguments.Word.fromInt 1 + val g = Arguments.Word.fromInt 2 + val t = Arguments.Word.fromInt 3 + val maximumBase = t + val baseBits = 0w2 + val nmerBits = Word.fromInt (Arguments.order * 2) + fun opposite base = + (* + Conveniently enough, xor properly implements this: + a -> t + c -> g + g -> c + t -> a + *) + Arguments.Word.xorb (base, maximumBase) + type nmer = Arguments.Word.word + val compare = Arguments.Word.compare + val minimum = Arguments.Word.fromInt 0 + local + fun shiftInto (nmer, base) = + Arguments.Word.+ ( + Arguments.Word.<< (nmer, baseBits) + , base + ) + fun maximumOfOrder order = + if order = 0 then minimum + else shiftInto ( + maximumOfOrder (order - 1) + , maximumBase + ) + in + val maximum = maximumOfOrder Arguments.order + end + local + val one = Arguments.Word.fromInt 1 + in + fun next nmer = Arguments.Word.+ (nmer, one) + end + local + fun charFromBase base = case Arguments.Word.toInt base of + 0 => #"A" + | 1 => #"C" + | 2 => #"G" + | 3 => #"T" + | _ => raise Fail "bug in nmer.sml" + fun get (nmer, index) = + let + fun multiplyByTwo word = Word.<< (word, 0w1) + val offset = multiplyByTwo ( + Word.fromInt ( + Arguments.order - 1 - index + ) + ) + in + Arguments.Word.~>> ( + Arguments.Word.andb ( + nmer + , Arguments.Word.<< ( + maximumBase + , offset + ) + ), offset + ) + end + in + fun toString nmer = CharVector.tabulate ( + Arguments.order + , fn index => charFromBase ( + get (nmer, index) + ) + ) + end + fun hash nmer = Word.fromLarge (Arguments.Word.toLarge nmer) + fun equal (a, b) = Arguments.Word.compare (a, b) = EQUAL + structure Undetermined = struct + type sidesBase = base + type sidesNmer = nmer + type 'reverse undeterminedSides = { + forward: nmer ref + , reverse: 'reverse + , count: int ref + } + fun clear {forward = _, reverse = _, count} = count := 0 + fun put ({forward, reverse, count}, base) = ( + forward := Arguments.Word.+ ( + Arguments.Word.andb ( + Arguments.Word.<< ( + !forward + , baseBits + ), maximum + ), base + ); + if !count = Arguments.order then () + else count := !count + 1 + ) + fun isFull {forward = _, reverse = _, count = ref count} = + count = Arguments.order + fun forward {forward = ref forward, reverse = _, count = _} = + forward + end + structure Single = struct + open Undetermined + type sides = unit undeterminedSides + fun create () = { + forward = ref minimum + , reverse = () + , count = ref 0 + } + end + structure Double = struct + open Undetermined + type sides = nmer ref undeterminedSides + fun create () = { + forward = ref minimum + , reverse = ref maximum + , count = ref 0 + } + val put = fn ( + sides as {forward = _, reverse, count = _} + , base + ) => ( + put (sides, base) + ; reverse := Arguments.Word.+ ( + Arguments.Word.~>> ( + !reverse + , baseBits + ), Arguments.Word.<< ( + opposite base + , nmerBits - baseBits + ) + ) + ) + fun reverse {reverse = ref reverse, forward = _, count = _} = + reverse + end + fun fromString string = + let + val side = Single.create () + val char = fn + #"A" => (Single.put (side, a); true) + | #"a" => (Single.put (side, a); true) + | #"C" => (Single.put (side, c); true) + | #"c" => (Single.put (side, c); true) + | #"G" => (Single.put (side, g); true) + | #"g" => (Single.put (side, g); true) + | #"T" => (Single.put (side, t); true) + | #"t" => (Single.put (side, t); true) + | _ => false + + in + if CharVector.all char string then + SOME (Single.forward side) + else NONE + end +end + +functor Test () = struct + structure Nmer1 = Nmer ( + val order = 1 + structure Word = Word32 + ) + structure Nmer2 = Nmer ( + val order = 2 + structure Word = Word32 + ) + structure Nmer3 = Nmer ( + val order = 3 + structure Word = Word32 + ) + val () = Test.list [ + { + description = "opposite a = t" + , function = fn () => + Bool.toString ( + Nmer1.opposite Nmer1.a = Nmer1.t + ) + , expectedResult = "true" + }, { + description = "opposite c = g" + , function = fn () => + Bool.toString ( + Nmer1.opposite Nmer1.c = Nmer1.g + ) + , expectedResult = "true" + }, { + description = "A forward" + , function = fn () => + let + val nmer = Nmer1.Double.create () + in + Nmer1.Double.put (nmer, Nmer1.a) + ; Nmer1.toString ( + Nmer1.Double.forward nmer + ) + end + , expectedResult = "A" + }, { + description = "A reverse" + , function = fn () => + let + val nmer = Nmer1.Double.create () + in + Nmer1.Double.put (nmer, Nmer1.a) + ; Nmer1.toString ( + Nmer1.Double.reverse nmer + ) + end + , expectedResult = "T" + }, { + description = "C forward" + , function = fn () => + let + val nmer = Nmer1.Double.create () + in + Nmer1.Double.put (nmer, Nmer1.c) + ; Nmer1.toString ( + Nmer1.Double.forward nmer + ) + end + , expectedResult = "C" + }, { + description = "C reverse" + , function = fn () => + let + val nmer = Nmer1.Double.create () + in + Nmer1.Double.put (nmer, Nmer1.c) + ; Nmer1.toString ( + Nmer1.Double.reverse nmer + ) + end + , expectedResult = "G" + }, { + description = "G forward" + , function = fn () => + let + val nmer = Nmer1.Double.create () + in + Nmer1.Double.put (nmer, Nmer1.g) + ; Nmer1.toString ( + Nmer1.Double.forward nmer + ) + end + , expectedResult = "G" + }, { + description = "G reverse" + , function = fn () => + let + val nmer = Nmer1.Double.create () + in + Nmer1.Double.put (nmer, Nmer1.g) + ; Nmer1.toString ( + Nmer1.Double.reverse nmer + ) + end + , expectedResult = "C" + }, { + description = "T forward" + , function = fn () => + let + val nmer = Nmer1.Double.create () + in + Nmer1.Double.put (nmer, Nmer1.t) + ; Nmer1.toString ( + Nmer1.Double.forward nmer + ) + end + , expectedResult = "T" + }, { + description = "T reverse" + , function = fn () => + let + val nmer = Nmer1.Double.create () + in + Nmer1.Double.put (nmer, Nmer1.t) + ; Nmer1.toString ( + Nmer1.Double.reverse nmer + ) + end + , expectedResult = "A" + }, { + description = "AA forward" + , function = fn () => + let + val nmer = Nmer2.Double.create () + in + Nmer2.Double.put (nmer, Nmer2.a) + ; Nmer2.Double.put (nmer, Nmer2.a) + ; Nmer2.toString ( + Nmer2.Double.forward nmer + ) + end + , expectedResult = "AA" + }, { + description = "AA reverse" + , function = fn () => + let + val nmer = Nmer2.Double.create () + in + Nmer2.Double.put (nmer, Nmer2.a) + ; Nmer2.Double.put (nmer, Nmer2.a) + ; Nmer2.toString ( + Nmer2.Double.reverse nmer + ) + end + , expectedResult = "TT" + }, { + description = "AC forward" + , function = fn () => + let + val nmer = Nmer2.Double.create () + in + Nmer2.Double.put (nmer, Nmer2.a) + ; Nmer2.Double.put (nmer, Nmer2.c) + ; Nmer2.toString ( + Nmer2.Double.forward nmer + ) + end + , expectedResult = "AC" + }, { + description = "AC reverse" + , function = fn () => + let + val nmer = Nmer2.Double.create () + in + Nmer2.Double.put (nmer, Nmer2.a) + ; Nmer2.Double.put (nmer, Nmer2.c) + ; Nmer2.toString ( + Nmer2.Double.reverse nmer + ) + end + , expectedResult = "GT" + }, { + description = "GTA forward" + , function = fn () => + let + val nmer = Nmer3.Double.create () + in + Nmer3.Double.put (nmer, Nmer3.g) + ; Nmer3.Double.put (nmer, Nmer3.t) + ; Nmer3.Double.put (nmer, Nmer3.a) + ; Nmer3.toString ( + Nmer3.Double.forward nmer + ) + end + , expectedResult = "GTA" + }, { + description = "GTA reverse" + , function = fn () => + let + val nmer = Nmer3.Double.create () + in + Nmer3.Double.put (nmer, Nmer3.g) + ; Nmer3.Double.put (nmer, Nmer3.t) + ; Nmer3.Double.put (nmer, Nmer3.a) + ; Nmer3.toString ( + Nmer3.Double.reverse nmer + ) + end + , expectedResult = "TAC" + }, { + description = "( ) isFull" + , function = fn () => + let + val nmer = Nmer1.Double.create () + in + Bool.toString ( + Nmer1.Double.isFull nmer + ) + end + , expectedResult = "false" + }, { + description = "(C) isFull" + , function = fn () => + let + val nmer = Nmer1.Double.create () + in + Nmer1.Double.put (nmer, Nmer1.g) + ; Bool.toString ( + Nmer1.Double.isFull nmer + ) + end + , expectedResult = "true" + }, { + description = "(C ) isFull" + , function = fn () => + let + val nmer = Nmer2.Double.create () + in + Nmer2.Double.put (nmer, Nmer2.c) + ; Bool.toString ( + Nmer2.Double.isFull nmer + ) + end + , expectedResult = "false" + }, { + description = "(CG) isFull" + , function = fn () => + let + val nmer = Nmer2.Double.create () + in + Nmer2.Double.put (nmer, Nmer2.c) + ; Nmer2.Double.put (nmer, Nmer2.g) + ; Bool.toString ( + Nmer2.Double.isFull nmer + ) + end + , expectedResult = "true" + }, { + description = "C(GA) isFull" + , function = fn () => + let + val nmer = Nmer2.Double.create () + in + Nmer2.Double.put (nmer, Nmer2.c) + ; Nmer2.Double.put (nmer, Nmer2.g) + ; Nmer2.Double.put (nmer, Nmer2.a) + ; Bool.toString ( + Nmer2.Double.isFull nmer + ) + end + , expectedResult = "true" + }, { + description = "CGA( ) isFull" + , function = fn () => + let + val nmer = Nmer2.Double.create () + in + Nmer2.Double.put (nmer, Nmer2.c) + ; Nmer2.Double.put (nmer, Nmer2.g) + ; Nmer2.Double.put (nmer, Nmer2.a) + ; Nmer2.Double.clear nmer + ; Bool.toString ( + Nmer2.Double.isFull nmer + ) + end + , expectedResult = "false" + }, { + description = "CGA (AC) isFull" + , function = fn () => + let + val nmer = Nmer2.Double.create () + in + Nmer2.Double.put (nmer, Nmer2.c) + ; Nmer2.Double.put (nmer, Nmer2.g) + ; Nmer2.Double.put (nmer, Nmer2.a) + ; Nmer2.Double.clear nmer + ; Nmer2.Double.put (nmer, Nmer2.a) + ; Nmer2.Double.put (nmer, Nmer2.c) + ; Bool.toString ( + Nmer2.Double.isFull nmer + ) + end + , expectedResult = "true" + }, { + description = "CGA (AC) forward" + , function = fn () => + let + val nmer = Nmer2.Double.create () + in + Nmer2.Double.put (nmer, Nmer2.c) + ; Nmer2.Double.put (nmer, Nmer2.g) + ; Nmer2.Double.put (nmer, Nmer2.a) + ; Nmer2.Double.clear nmer + ; Nmer2.Double.put (nmer, Nmer2.a) + ; Nmer2.Double.put (nmer, Nmer2.c) + ; Nmer2.toString ( + Nmer2.Double.forward nmer + ) + end + , expectedResult = "AC" + }, { + description = "CGA (AC) reverse" + , function = fn () => + let + val nmer = Nmer2.Double.create () + in + Nmer2.Double.put (nmer, Nmer2.c) + ; Nmer2.Double.put (nmer, Nmer2.g) + ; Nmer2.Double.put (nmer, Nmer2.a) + ; Nmer2.Double.clear nmer + ; Nmer2.Double.put (nmer, Nmer2.a) + ; Nmer2.Double.put (nmer, Nmer2.c) + ; Nmer2.toString ( + Nmer2.Double.reverse nmer + ) + end + , expectedResult = "GT" + }, { + description = "TG fromString" + , function = fn () => + case Nmer2.fromString "TG" of + NONE => "invalid string" + | SOME nmer => Nmer2.toString nmer + , expectedResult = "TG" + } + ] +end + +functor Nmer (Arguments: NMER_ARGUMENTS) :> NMER = Nmer (Arguments) diff --git a/src/nbc/options.sml b/src/nbc/options.sml new file mode 100644 index 0000000..907c95a --- /dev/null +++ b/src/nbc/options.sml @@ -0,0 +1,202 @@ +signature OPTIONS = sig + val order: int + val input: string + val genomesDir: string + val genomes: string option + val genomeText: int * string -> string + val totalWords: string * int -> string + val missConstant: real + datatype output = + Matlab of {variable: string, file: string} + | Text of string + | Dual of {text: string, matlab: {variable: string, file: string}} + val output: string -> output + val arguments: string list +end + +structure Options :> OPTIONS = struct + fun |> (x, f) = f x + infix |> + datatype outformat = Otext | Omatlab | Odual + local + val order = ref 15 + val fasta = ref NONE + val genomesDir = ref "/var/lib/genomes" + val genomes = ref NONE + val genomeText = ref "$genomes_dir/$genome/${order}perword.gz" + val totalWords = ref "$genomes_dir/$genome/${order}total" + val missConstant = ref (1.0 / 26067530.0 / 1000000.0) + val outFormat = ref Otext + val variable = ref "ps_g" + val outFile = ref "$input-$order-$genome.$extension" + fun withDefault (description, default) = concat ( + description + :: " (" + :: (case default of + NONE => ["no default)"] + | SOME x => ["default: ", x, ")"]) + ) + val options = [ + { + short = "r", long = ["order"] + , help = withDefault + ("Set the length of words used in matches", NONE) + , desc = GetOpt.ReqArg ( + fn x => case Int.fromString x of + NONE => Fail.fail + "given order is not a valid integer" + | SOME y => order := y + , "integer" + ) + }, { + short = "a", long = ["fasta-input"] + , help = "FASTA input file" + , desc = GetOpt.ReqArg (fn x => fasta := SOME x, "filename") + }, { + short = "j", long = ["genomes-dir"] + , help = withDefault ( + "Set the directory where genomes are stored" + , SOME (!genomesDir) + ), desc = GetOpt.ReqArg (fn x => genomesDir := x, "directory") + }, { + short = "g", long = ["genome-list"] + , help = withDefault ( + "Read genomes from this file, one per line" + , !genomes + ), desc = GetOpt.ReqArg (fn x => genomes := SOME x, "filename") + }, { + short = "w", long = ["genome-text"] + , help = withDefault ( + "Filename to read per-word counts from" + , SOME (!genomeText) + ), desc = GetOpt.ReqArg (fn x => genomeText := x, "filename") + }, { + short = "t", long = ["total-words"] + , help = withDefault ( + "Filename to read total word counts from" + , SOME (!totalWords) + ), desc = GetOpt.ReqArg (fn x => totalWords := x, "filename") + }, { + short = "k", long = ["miss-constant"] + , help = withDefault ( + "Set the constant used for scoring missing words" + , SOME (Real.toString (!missConstant)) + ), desc = GetOpt.ReqArg ( + fn x => (case Real.fromString x of + SOME y => missConstant := y + | NONE => Fail.fail + "given miss constant is not a valid number" + ), "number" + ) + }, { + short = "x", long = ["text"] + , help = "Select gzipped text output (default)" + , desc = GetOpt.NoArg (fn () => outFormat := Otext) + }, { + short = "m", long = ["matlab"] + , help = "Select Matlab binary output" + , desc = GetOpt.NoArg (fn () => outFormat := Omatlab) + }, { + short = "u", long = ["dual"] + , help = "Select both gzipped text and Matlab binary output" + , desc = GetOpt.NoArg (fn () => outFormat := Odual) + }, { + short = "v", long = ["variable"] + , help = withDefault ( + "Set Matlab output variable" + , SOME (!variable) + ), desc = GetOpt.ReqArg (fn x => variable := x, "name") + }, { + short = "o", long = ["output-file"] + , help = withDefault ( + "Set output filename(s)" + , SOME (!outFile) + ), desc = GetOpt.ReqArg (fn x => outFile := x, "filename") + } + ] + val optionsWithHelp = { + short = "h", long = ["help"] + , help = "Display help" + , desc = GetOpt.NoArg (fn () => ( + print ( + GetOpt.usageInfo { + header = "score " + , options = options + } ^ "\n" + ); OS.Process.exit OS.Process.success + )) + } :: options + in + val (_, arguments) = GetOpt.getOpt { + argOrder = GetOpt.Permute + , options = optionsWithHelp + , errFn = print + } (CommandLine.arguments ()) + val order = !order + val input = case !fasta of + NONE => Fail.fail "No input file given" + | SOME x => x + val genomesDir = !genomesDir + val genomes = !genomes + val genomeText = fn (order, genome) => ( + case + Misc.substitute [ + ("genomes_dir", genomesDir) + , ("genome", genome) + , ("order", Int.toString order) + ] (!genomeText) + of + NONE => Fail.fail "bad per-word count filename syntax" + | SOME x => x + ) + val totalWords = fn (genome, order) => ( + case + Misc.substitute [ + ("genomes_dir", genomesDir) + , ("genome", genome) + , ("order", Int.toString order) + ] (!totalWords) + of + NONE => Fail.fail "bad total word count filename syntax" + | SOME x => x + ) + val missConstant = !missConstant + datatype output = + Matlab of {variable: string, file: string} + | Text of string + | Dual of {text: string, matlab: {variable: string, file: string}} + fun output genome = + let + val common = [ + ("input", Misc.basenameWithoutExtension input) + , ("order", Int.toString order) + , ("genome", genome) + ] + fun textName () = + case Misc.substitute + (("extension", "txt.gz") :: common) (!outFile) + of + NONE => Fail.fail "bad text output filename syntax" + | SOME x => x + fun matlabName () = + case Misc.substitute + (("extension", "mat") :: common) (!outFile) + of + NONE => Fail.fail "bad Matlab output filename syntax" + | SOME x => x + in + case !outFormat of + Otext => Text (textName ()) + | Omatlab => Matlab { + variable = !variable + , file = matlabName () + } | Odual => Dual { + text = textName () + , matlab = { + variable = !variable + , file = matlabName () + } + } + end + end +end diff --git a/src/nbc/parse-state.mlb b/src/nbc/parse-state.mlb new file mode 100644 index 0000000..6b0ccf3 --- /dev/null +++ b/src/nbc/parse-state.mlb @@ -0,0 +1,5 @@ +local + $(SML_LIB)/basis/basis.mlb +in + parse-state.sml +end diff --git a/src/nbc/parse-state.sml b/src/nbc/parse-state.sml new file mode 100644 index 0000000..8b30a67 --- /dev/null +++ b/src/nbc/parse-state.sml @@ -0,0 +1,89 @@ +signature PARSE_STATE = sig + type ('argument, 'result) state + val create: unit -> ('argument, 'result) state + type ('argument, 'result) handler = + TextIO.instream * 'argument -> 'result + datatype whichCharacters = These of char list | Any + val build: { + state: ('argument, 'result) state + , characters: + (whichCharacters * ('argument, 'result) handler) list + , endOfFile: ('argument, 'result) handler + } -> unit + val enter: + ('argument, 'result) state + * TextIO.instream + * 'argument + -> 'result +end + +structure ParseState :> PARSE_STATE = struct + type ('argument, 'result) handler = + TextIO.instream * 'argument -> 'result + datatype whichCharacters = These of char list | Any + type ('argument, 'result) state = { + byCharacter: Int8.int vector ref + , byIndex: ('argument, 'result) handler vector ref + , endOfFile: ('argument, 'result) handler option ref + } + fun create () = { + byCharacter = ref (Vector.fromList nil) + , byIndex = ref (Vector.fromList nil) + , endOfFile = ref NONE + } + fun build { + state = {byCharacter, byIndex, endOfFile} + , characters + , endOfFile = newEndOfFile + } = + let + val characters = vector characters + fun equal (one: char) (two: char) = + one = two + fun shallHandle ((whichToHandle, _), char) = + case whichToHandle of + Any => true + | These charactersToHandle => + List.exists (equal char) + charactersToHandle + fun charToIndex char = + case + Vector.findi (fn (_, handler) => + shallHandle (handler, char) + ) characters + of + NONE => raise Fail ( + "ParseState.build: " + ^ Char.toString char + ^ " not found" + ) | SOME (index, _) => + Int8.fromInt index + fun handlerToFunction (_, function) = function + fun indexToFunction index = handlerToFunction ( + Vector.sub (characters, index) + ) + in + byCharacter := Vector.tabulate ( + Char.maxOrd + 1 + , charToIndex o chr + ); byIndex := + Vector.map (fn (_, function) => + function + ) characters + ; endOfFile := SOME newEndOfFile + end + fun enter ( + { + byCharacter = ref byCharacter + , byIndex = ref byIndex + , endOfFile = ref endOfFile + } + , instream + , argument + ) = case TextIO.input1 instream of + NONE => (valOf endOfFile) (instream, argument) + | SOME char => Vector.sub ( + byIndex + , Int8.toInt (Vector.sub (byCharacter, ord char)) + ) (instream, argument) +end diff --git a/src/nbc/probabilities-by-read b/src/nbc/probabilities-by-read new file mode 100755 index 0000000..ccdb007 Binary files /dev/null and b/src/nbc/probabilities-by-read differ diff --git a/src/nbc/probabilities-by-read.mlb b/src/nbc/probabilities-by-read.mlb new file mode 100644 index 0000000..f487180 --- /dev/null +++ b/src/nbc/probabilities-by-read.mlb @@ -0,0 +1,8 @@ +local + $(SML_LIB)/basis/basis.mlb + $(SML_LIB)/smlnj-lib/Util/smlnj-lib.mlb + nmer.mlb + fasta.mlb +in + probabilities-by-read.sml +end diff --git a/src/nbc/probabilities-by-read.sml b/src/nbc/probabilities-by-read.sml new file mode 100644 index 0000000..acd8041 --- /dev/null +++ b/src/nbc/probabilities-by-read.sml @@ -0,0 +1,210 @@ +local + fun usage result = ( + TextIO.output ( + TextIO.stdErr + , CommandLine.name () + ^ " \n" + ); OS.Process.exit result + ) +in + val (order, input, toCount) = case CommandLine.arguments () of + [order, input, toCount] => (case Int.fromString order of + NONE => usage OS.Process.failure + | SOME order => (order, input, toCount) + ) | ["--help"] => usage OS.Process.success + | _ => usage OS.Process.failure +end + +datatype result = Success | Failure +fun warn message = TextIO.output ( + TextIO.stdErr + , ( + "warning: " + ^ message + ^ "\n" + ) +) + +signature NMER_TABLE = sig + type nmer + type table + + (* + Create the table. Only the provided nmers will be counted. + *) + val create: nmer list -> table + + (* + Increment the count for a given nmer. If the nmer was + not provided at table creation, do nothing. + *) + val bump: table * nmer -> unit + + (* + Reset all counts to zero. Do not change the list of + nmers to count. + *) + val clear: table -> unit + + (* + Apply a function to all nmers and their counts, in + lexicographic order. + *) + val app: (nmer * int -> unit) -> table -> unit +end + +functor NmerTable (Nmer: NMER) +:> NMER_TABLE where type nmer = Nmer.nmer += struct + structure HashTable = HashTableFn ( + type hash_key = Nmer.nmer + val hashVal = Nmer.hash + val sameKey = Nmer.equal + ) + exception NotFound + type nmer = Nmer.nmer + type table = { + indexes: int HashTable.hash_table + , counts: int array + , nmers: nmer vector + } + fun create list = + let + val indexes = HashTable.mkTable (1024, NotFound) + val nmers = + let + val array = Array.fromList list + in + ArrayQSort.sort Nmer.compare array + ; Array.vector array + end + in + Vector.appi (fn (index, nmer) => + HashTable.insert indexes (nmer, index) + ) nmers + ; { + indexes = indexes + , nmers = nmers + , counts = Array.array ( + Vector.length nmers + , 0 + ) + } + end + fun bump ({indexes, nmers = _, counts}, nmer) = + case HashTable.find indexes nmer of + NONE => () + | SOME index => Array.update ( + counts + , index + , Array.sub (counts, index) + 1 + ) + fun clear {indexes = _, nmers = _, counts} = + Array.modify (fn _ => 0) counts + fun app execute {indexes = _, nmers, counts} = + Vector.appi (fn (index, nmer) => + execute (nmer, Array.sub (counts, index)) + ) nmers +end + +functor Input ( + structure Nmer: NMER + structure NmerTable: NMER_TABLE + sharing type Nmer.nmer = NmerTable.nmer +) = SingleSidedFasta ( + structure Nmer = Nmer + structure File = struct + type argument = NmerTable.table + type file = argument + type read = Int64.int ref + type nmer = Nmer.nmer + datatype result = datatype result + exception NotFound + fun startFile table = table + fun startRead (table, _) = ref (0: Int64.int) + fun nmer (table, (total: Int64.int ref), nmer) = ( + total := !total + 1 + ; NmerTable.bump (table, nmer) + ) + fun put string = TextIO.output (TextIO.stdOut, string) + fun stopRead (table, total) = + let + val realFromInt64 = + Real.fromLargeInt o Int64.toLarge + val realTotal = realFromInt64 (!total) + val toString = Real.fmt ( + StringCvt.FIX (SOME 17) + ) + fun probability count = real count / realTotal + infix |> + fun argument |> function = function argument + val first = ref true + in + NmerTable.app (fn (nmer, count) => ( + if !first then first := false + else put "\t" + ; count + |> probability + |> toString + |> put + )) table + ; put "\n" + ; NmerTable.clear table + ; total := 0 + end + fun stopFile _ = Success + fun invalidFormat _ = Failure + end +) + +structure Nmer = Nmer ( + val order = order + structure Word = Word64 +) +structure NmerTable = NmerTable (Nmer) +structure Input = Input ( + structure Nmer = Nmer + structure NmerTable = NmerTable +) + +val table = + let + fun build collect goose = case collect goose of + NONE => nil + | SOME egg => + egg :: build collect goose + fun chopEnd string = String.extract ( + string + , 0 + , SOME (size string - ( + if String.isSuffix "\r\n" string + then 2 + else 1 + )) + ) + val line = Option.map chopEnd o TextIO.inputLine + val lines = build line + val instream = TextIO.openIn toCount + fun fromStringWithWarning string = + case Nmer.fromString string of + NONE => ( + warn ( + string + ^ " is not a valid nmer" + ); NONE + ) | someNmer => someNmer + val nmers = List.mapPartial + fromStringWithWarning + (lines instream) + in + TextIO.closeIn instream + ; NmerTable.create nmers + end + +val () = case Input.process (table, TextIO.openIn input) of + Failure => ( + TextIO.output ( + TextIO.stdErr + , "input is not valid FASTA\n" + ); OS.Process.exit OS.Process.failure + ) | Success => () diff --git a/src/nbc/program.sml b/src/nbc/program.sml new file mode 100644 index 0000000..df787ad --- /dev/null +++ b/src/nbc/program.sml @@ -0,0 +1,9 @@ +signature PROGRAM = sig + val version: string + val name: string +end + +structure Program :> PROGRAM = struct + val version = "2.0" + val name = "Naive Bayes Classifier - Score" +end diff --git a/src/nbc/promise.mlb b/src/nbc/promise.mlb new file mode 100644 index 0000000..7e1f71f --- /dev/null +++ b/src/nbc/promise.mlb @@ -0,0 +1,5 @@ +local + $(SML_LIB)/basis/basis.mlb +in + promise.sml +end diff --git a/src/nbc/promise.sml b/src/nbc/promise.sml new file mode 100644 index 0000000..6bb2655 --- /dev/null +++ b/src/nbc/promise.sml @@ -0,0 +1,24 @@ +structure Promise +:> sig + type 'fulfillment promise + val delay: (unit -> 'fulfillment) -> 'fulfillment promise + val force: 'fulfillment promise -> 'fulfillment +end = struct + local + datatype 'expectation lazy = + Delayed of unit -> 'expectation + | Forced of 'expectation + in + type 'expectation promise = 'expectation lazy ref + fun delay fulfill = ref (Delayed fulfill) + fun force promise = case !promise of + Delayed fulfill => + let + val expectation = fulfill () + in + promise := Forced expectation + ; expectation + end + | Forced expectation => expectation + end +end diff --git a/src/nbc/score.mlb b/src/nbc/score.mlb new file mode 100644 index 0000000..662bb83 --- /dev/null +++ b/src/nbc/score.mlb @@ -0,0 +1,27 @@ +$(SML_LIB)/basis/basis.mlb +$(SML_LIB)/basis/mlton.mlb +$(SML_LIB)/smlnj-lib/Util/smlnj-lib.mlb +$(SML_LIB)/mlyacc-lib/mlyacc-lib.mlb +fail.sml +sequence.sml +ann "allowFFI true" in + judy.sml + gzip.sml +end +substitution.grm.sig +substitution.grm.sml +substitution.lex.sml +substitution.sml +misc.sml +options.sml +storejudy.sml +genome.sml +nmer.sml +kahan.sml +score.sml +binary.sml +program.sml +matlab.sml +stopwatch.sml +fasta.sml +main.sml diff --git a/src/nbc/score.sml b/src/nbc/score.sml new file mode 100644 index 0000000..90d569a --- /dev/null +++ b/src/nbc/score.sml @@ -0,0 +1,30 @@ +signature SCORE = sig + val score: int * real * (string -> int option) * real * string -> real +end + +structure Score :> SCORE = struct + fun |> (x, f) = f x + infix |> + + fun addCount (hitsum, fcount, gcount, totalWords) = + Kahan.add (hitsum, Real.fromInt fcount * Math.ln (Real.fromInt gcount / totalWords)) + fun addNmer (totalWords, getGenomeCount) (nmer, ref fcount, (misses, anyhits, hitsum)) = + case getGenomeCount nmer of + NONE => (misses + 1, anyhits, hitsum) + | SOME gcount => ( + misses, true + , addCount (hitsum, fcount, gcount, totalWords) + ) + fun score (order, missConstant, getGenomeCount, totalWords, fragment) = + let + val add = addNmer (totalWords, getGenomeCount) + val seed = (0, false, Kahan.zero) + val (misses, anyhits, hitsum) = + Nmer.count (order, fragment) |> HashTable.foldi add seed + in + if anyhits then + Kahan.add (hitsum, Math.ln missConstant * Real.fromInt misses) + |> Kahan.sum + else Real.negInf + end +end diff --git a/src/nbc/sequence.sml b/src/nbc/sequence.sml new file mode 100644 index 0000000..73a3fbf --- /dev/null +++ b/src/nbc/sequence.sml @@ -0,0 +1,74 @@ +signature SEQUENCE = sig + type 'a t + val fold: ('a * 'b -> 'b) -> 'b -> 'a t -> 'b + val from: (unit -> 'a option) -> 'a t + val map: ('a -> 'b) -> 'a t -> 'b t + val app: ('a -> unit) -> 'a t -> unit + val fromArray: 'a array -> 'a t + val fromList: 'a list -> 'a t + val toList: 'a t -> 'a list + val toArray: 'a t -> 'a array +end + +structure Sequence :> SEQUENCE = struct + type 'a t = unit -> 'a option + fun fold f seed t = + let + fun loop x = + case t () of + NONE => x + | SOME y => loop (f (y, x)) + in + loop seed + end + fun from t = t + fun map f t = + fn () => ( + case t () of + NONE => NONE + | SOME x => SOME (f x) + ) + fun app f t = + let + fun loop () = + case t () of + NONE => () + | SOME x => ( + f x + ; loop () + ) + in + loop () + end + fun fromArray a = + let + val i = ref 0 + fun f () = + if !i >= Array.length a then NONE + else + SOME (Array.sub (a, !i)) + before i := !i + 1 + in + from f + end + fun fromList l = + let + val c = ref l + fun f () = + case !c of + x :: y => (c := y; SOME x) + | nil => NONE + in + from f + end + fun toList t = + let + fun loop l = + case t () of + NONE => rev l + | SOME x => loop (x :: l) + in + loop nil + end + fun toArray t = Array.fromList (toList t) +end diff --git a/src/nbc/stopwatch.sml b/src/nbc/stopwatch.sml new file mode 100644 index 0000000..43547ed --- /dev/null +++ b/src/nbc/stopwatch.sml @@ -0,0 +1,24 @@ +signature STOPWATCH = sig + exception FinishWithoutStart + val start: string -> unit + val finish: unit -> unit +end +structure Stopwatch :> STOPWATCH = struct + exception FinishWithoutStart + local + val time = ref NONE + in + fun start doing = ( + print (concat [doing, "..."]) + ; time := SOME (Time.now ()) + ) + fun finish () = case !time of + SOME t => ( + print (concat [ + " done in " + , Time.toString (Time.- (Time.now (), t)) + , " seconds.\n" + ]); time := NONE + ) | NONE => raise FinishWithoutStart + end +end diff --git a/src/nbc/storejudy.sml b/src/nbc/storejudy.sml new file mode 100644 index 0000000..c6385be --- /dev/null +++ b/src/nbc/storejudy.sml @@ -0,0 +1,17 @@ +signature STORE_JUDY = sig + type t + val load: (int * string) Sequence.t -> t + val get: t * string -> int option +end + +structure StoreJudy :> STORE_JUDY = struct + type t = Judy.t + fun load e = + let + val j = Judy.create () + in + Sequence.app (fn (count, nmer) => Judy.insert (j, nmer, count)) e + ; j + end + val get = Judy.get +end diff --git a/src/nbc/stream.mlb b/src/nbc/stream.mlb new file mode 100644 index 0000000..639facc --- /dev/null +++ b/src/nbc/stream.mlb @@ -0,0 +1,6 @@ +local + $(SML_LIB)/basis/basis.mlb + promise.mlb +in + stream.sml +end diff --git a/src/nbc/stream.sml b/src/nbc/stream.sml new file mode 100644 index 0000000..00b60ab --- /dev/null +++ b/src/nbc/stream.sml @@ -0,0 +1,243 @@ +signature STREAM = sig + type 'element stream + val create: + (unit -> ('element * 'element stream) option) + -> 'element stream + val empty: unit -> 'element stream + val cons: 'element * 'element stream -> 'element stream + val unfold: + ('seed -> ('fruit * 'seed) option) + -> 'seed + -> 'fruit stream + val getItem: 'element stream -> ('element * 'element stream) option + val isEmpty: 'element stream -> bool + val fold: + ('element * 'accumulation -> 'accumulation) + -> 'accumulation + -> 'element stream + -> 'accumulation + val length: 'element stream -> int + val rev: 'element stream -> 'element stream + val map: ('input -> 'output) -> 'input stream -> 'output stream + val mapPartial: + ('input -> 'output option) + -> 'input stream + -> 'output stream + val app: ('element -> unit) -> 'element stream -> unit + val toList: 'element stream -> 'element list + val fromList: 'element list -> 'element stream + val toVector: 'element stream -> 'element vector + val fromVector: 'element vector -> 'element stream + val fromVectorSlice: 'element VectorSlice.slice -> 'element stream + val toArray: 'element stream -> 'element array + val fromArray: 'element array -> 'element stream + val fromArraySlice: 'element ArraySlice.slice -> 'element stream + val fromString: string -> char stream + val fromSubstring: Substring.substring -> char stream + val toString: char stream -> string + val fromTextInstream: TextIO.instream -> char stream + val append: 'element stream * 'element stream -> 'element stream + val concat: 'element stream stream -> 'element stream + val hd: 'element stream -> 'element + val tl: 'element stream -> 'element stream + val find: ('element -> bool) -> 'element stream -> 'element option + val filter: ('element -> bool) -> 'element stream -> 'element stream + val exists: ('element -> bool) -> 'element stream -> bool + val all: ('element -> bool) -> 'element stream -> bool + val partition: + ('element -> bool) + -> 'element stream + -> 'element stream * 'element stream + val take: ('element -> bool) -> 'element stream -> 'element stream + val drop: ('element -> bool) -> 'element stream -> 'element stream + val split: + ('element -> bool) + -> 'element stream + -> 'element stream * 'element stream + val trim: 'element stream * int -> 'element stream + val tokens: + ('element -> bool) + -> 'element stream + -> 'element stream stream + val fields: + ('element -> bool) + -> 'element stream + -> 'element stream stream +end + +structure Stream :> STREAM = struct + datatype 'element stream = + T of unit -> ('element * 'element stream) option + fun create function = T function + fun empty () = create (fn () => NONE) + fun cons headAndTail = create (fn () => SOME headAndTail) + fun unfold harvest seed = create (fn () => + case harvest seed of + NONE => NONE + | SOME (fruit, seed) => SOME ( + fruit + , unfold harvest seed + ) + ) + fun getItem (T function) = function () + fun fromList list = unfold List.getItem list + fun toList stream = case getItem stream of + NONE => nil + | SOME (head, tail) => head :: toList tail + fun fold accumulate accumulation stream = case getItem stream of + NONE => accumulation + | SOME (head, tail) => + fold accumulate (accumulate (head, accumulation)) tail + fun length stream = fold (fn (_, count) => count + 1) 0 stream + fun rev stream = fromList (fold op :: nil stream) + fun map transform stream = unfold (fn stream => + case getItem stream of + NONE => NONE + | SOME (head, tail) => SOME ( + transform head + , tail + ) + ) stream + fun app execute stream = + fold (fn (element, ()) => execute element) () stream + fun fromVectorSlice slice = unfold VectorSlice.getItem slice + fun fromVector vector = fromVectorSlice (VectorSlice.full vector) + fun fromArraySlice slice = unfold ArraySlice.getItem slice + fun fromArray array = fromArraySlice (ArraySlice.full array) + fun fromSubstring substring = unfold Substring.getc substring + fun fromString string = fromSubstring (Substring.full string) + local + fun withTabulate tabulate stream = + let + val position = ref stream + in + tabulate ( + length stream + , fn _ => case getItem (!position) of + NONE => raise Fail "Stream" + | SOME (head, tail) => ( + position := tail + ; head + ) + ) + end + in + fun toVector stream = withTabulate Vector.tabulate stream + fun toArray stream = withTabulate Array.tabulate stream + fun toString stream = withTabulate CharVector.tabulate stream + end + fun fromTextInstream instream = + unfold TextIO.StreamIO.input1 (TextIO.getInstream instream) + fun append (first, second) = create (fn () => + case getItem first of + NONE => getItem second + | SOME (head, tail) => SOME ( + head + , append (tail, second) + ) + ) + fun concat streams = create (fn () => + case getItem streams of + NONE => NONE + | SOME (head, tail) => + getItem (append (head, concat tail)) + ) + fun hd stream = case getItem stream of + NONE => raise Empty + | SOME (head, _) => head + fun tl stream = case getItem stream of + NONE => raise Empty + | SOME (_, tail) => tail + fun last stream = hd (rev stream) + fun drop (stream, count) = + if count < 0 then raise Subscript + else if count = 0 then stream + else case getItem stream of + NONE => raise Subscript + | SOME (_, tail) => drop (tail, count - 1) + fun nth streamAndOffset = case getItem (drop streamAndOffset) of + NONE => raise Subscript + | SOME (head, _) => head + fun mapPartial transform stream = create (fn () => + case getItem stream of + NONE => NONE + | SOME (head, tail) => case transform head of + NONE => getItem (mapPartial transform tail) + | SOME element => SOME ( + element + , mapPartial transform tail + ) + ) + fun find test stream = case getItem stream of + NONE => NONE + | SOME (head, tail) => + if test head then SOME head + else find test tail + fun filter test stream = unfold (fn stream => + case getItem stream of + NONE => NONE + | someHeadAndTail as (SOME (head, tail)) => + if test head then someHeadAndTail + else getItem (filter test tail) + ) stream + fun exists test stream = case find test stream of + NONE => false + | SOME _ => true + fun all test stream = not (exists (not o test) stream) + fun partition test stream = + let + val withResult = map (fn element => + (test element, element) + ) stream + in ( + mapPartial (fn (result, element) => + if result then SOME element + else NONE + ) withResult + , mapPartial (fn (result, element) => + if result then NONE + else SOME element + ) withResult + ) end + fun take test stream = create (fn () => + case getItem stream of + NONE => NONE + | SOME (head, tail) => + if test head then SOME (head, take test tail) + else NONE + ) + fun drop test stream = create (fn () => + case getItem stream of + NONE => NONE + | someHeadAndTail as (SOME (head, tail)) => + if test head then getItem (drop test tail) + else someHeadAndTail + ) + fun split test stream = (take test stream, drop test stream) + fun trim (stream, count) = + if count <= 0 then stream + else create (fn () => + case getItem stream of + NONE => NONE + | SOME (_, tail) => + getItem (trim (tail, count - 1)) + ) + fun isEmpty stream = case getItem stream of + NONE => true + | SOME _ => false + fun tokens isSeparator stream = unfold (fn stream => + let + val skipped = drop isSeparator stream + in + if isEmpty skipped then NONE + else SOME (split (not o isSeparator) skipped) + end + ) stream + fun fields isSeparator stream = unfold (fn stream => + if isEmpty stream then NONE + else SOME ( + take (not o isSeparator) stream + , trim (drop (not o isSeparator) stream, 1) + ) + ) stream +end diff --git a/src/nbc/substitution.grm b/src/nbc/substitution.grm new file mode 100644 index 0000000..58505b5 --- /dev/null +++ b/src/nbc/substitution.grm @@ -0,0 +1,50 @@ +%% +%name SubstitutionGrm +%pos int +%arg (lookup): string -> string +%term + DOLLAR + | TEXT of string + | LEFT_PARENTHESIS + | LEFT_BRACE + | RIGHT_PARENTHESIS + | RIGHT_BRACE + | EOF +%nonterm + STRING of string + | LIST of string list + | PORTION of string + | VARIABLE of string + | PARENTHESIZED of string list + | INSIDE_PARENTHESES of string list + | BRACED of string list + | INSIDE_BRACES of string list +%eop EOF +%noshift EOF +%start STRING +%% +STRING: LIST (concat LIST) +LIST: + (nil) + | PORTION LIST (PORTION :: LIST) +PORTION: + TEXT (TEXT) + | VARIABLE (lookup VARIABLE) +VARIABLE: + DOLLAR TEXT (TEXT) + | PARENTHESIZED (concat PARENTHESIZED) + | BRACED (concat BRACED) +PARENTHESIZED: + LEFT_PARENTHESIS INSIDE_PARENTHESES RIGHT_PARENTHESIS (INSIDE_PARENTHESES) +INSIDE_PARENTHESES: + (nil) + | TEXT INSIDE_PARENTHESES (TEXT :: INSIDE_PARENTHESES) + | LEFT_PARENTHESIS INSIDE_PARENTHESES RIGHT_PARENTHESIS INSIDE_PARENTHESES + ("(" :: INSIDE_PARENTHESES1 @ ")" :: INSIDE_PARENTHESES2) +BRACED: + LEFT_BRACE INSIDE_BRACES RIGHT_BRACE (INSIDE_BRACES) +INSIDE_BRACES: + (nil) + | TEXT INSIDE_BRACES (TEXT :: INSIDE_BRACES) + | LEFT_BRACE INSIDE_BRACES RIGHT_BRACE INSIDE_BRACES + ("{" :: INSIDE_BRACES1 @ "}" :: INSIDE_BRACES2) diff --git a/src/nbc/substitution.lex b/src/nbc/substitution.lex new file mode 100644 index 0000000..e460121 --- /dev/null +++ b/src/nbc/substitution.lex @@ -0,0 +1,54 @@ +type svalue = Tokens.svalue +type ('a, 'b) token = ('a, 'b) Tokens.token +type pos = int +type lexresult = (svalue, pos) token +type arg = int ref +fun eof _ = Tokens.EOF (~1, ~1) +%% +%full +%header (functor SubstitutionLexFun (structure Tokens: SubstitutionGrm_TOKENS)); +%arg (nesting); +%s VARIABLE PARENTHESIZED BRACED; +text = ([^$] | "\\$")+; +dollar = "$"; +leftparenthesis = "("; +rightparenthesis = ")"; +notparenthesis = [^()]+; +leftbrace = "{"; +rightbrace = "}"; +notbrace = [^{}]+; +other = [A-Za-z0-9_]+; +%% +{text} => (Tokens.TEXT (yytext, yypos, yypos + size yytext)); +{dollar} => (YYBEGIN VARIABLE; Tokens.DOLLAR (yypos, yypos + 1)); +{dollar}{leftparenthesis} => ( + YYBEGIN PARENTHESIZED + ; nesting := !nesting + 1 + ; Tokens.LEFT_PARENTHESIS (yypos, yypos + 1) +); +{dollar}{leftbrace} => ( + YYBEGIN BRACED + ; nesting := !nesting + 1 + ; Tokens.LEFT_BRACE (yypos, yypos + 1) +); +{other} => (YYBEGIN INITIAL; Tokens.TEXT (yytext, yypos, yypos + size yytext)); +{notparenthesis} => (Tokens.TEXT (yytext, yypos, yypos + size yytext)); +{leftparenthesis} => ( + nesting := !nesting + 1 + ; Tokens.LEFT_PARENTHESIS (yypos, yypos + 1) +); +{rightparenthesis} => ( + nesting := !nesting - 1 + ; if !nesting = 0 then YYBEGIN INITIAL else () + ; Tokens.RIGHT_PARENTHESIS (yypos, yypos + 1) +); +{notbrace} => (Tokens.TEXT (yytext, yypos, yypos + size yytext)); +{leftbrace} => ( + nesting := !nesting + 1 + ; Tokens.LEFT_BRACE (yypos, yypos + 1) +); +{rightbrace} => ( + nesting := !nesting - 1 + ; if !nesting = 0 then YYBEGIN INITIAL else () + ; Tokens.RIGHT_BRACE (yypos, yypos + 1) +); diff --git a/src/nbc/substitution.sml b/src/nbc/substitution.sml new file mode 100644 index 0000000..57d55cf --- /dev/null +++ b/src/nbc/substitution.sml @@ -0,0 +1,26 @@ +signature SUBSTITUTION = sig + val substitute: (string -> string) -> string -> string option +end + +structure Substitution :> SUBSTITUTION = struct + structure LrVals = SubstitutionGrmLrValsFun (structure Token = LrParser.Token) + structure Lex = SubstitutionLexFun (structure Tokens = LrVals.Tokens) + structure Parser = JoinWithArg ( + structure ParserData = LrVals.ParserData + structure Lex = Lex + structure LrParser = LrParser + ) + fun substitute lookup source = + let + val position = ref 0 + val instream = TextIO.openString source + fun read n = TextIO.inputN (instream, n) + val lexer = Parser.makeLexer read (ref 0) + fun error (_, _, _) = () + val (result, _) = Parser.parse (0, lexer, error, lookup) + val () = TextIO.closeIn instream + in + SOME result + end + handle Parser.ParseError => NONE +end diff --git a/src/nbc/tabulate.ml b/src/nbc/tabulate.ml new file mode 100644 index 0000000..dbfbe43 --- /dev/null +++ b/src/nbc/tabulate.ml @@ -0,0 +1,190 @@ +let (|>) a b = b a +let car, cdr = fst, snd +let odd n = n land 1 = 1 +let even n = not (odd n) +module ExtArray = ExtArray.Array +module ExtList = ExtList.List +module ExtString = ExtString.String +module Program = struct + let name = "Naive Bayes Classifier - Tabulate" + let version = "1.0" +end +let chop_extra s = ExtString.strip ~chars:"_- " s +let any_alphanumeric s = + let e = String.length s in + let rec loop i = + if i = e then false + else match s.[i] with + 'a'..'z' | 'A'..'Z' | '0'..'9' -> true + | _ -> loop (i + 1) + in loop 0 +let guess_prefix filenames = + let s = + List.map Misc.basename_without_extension filenames + |> Misc.longest_common_substring |> chop_extra + in + if any_alphanumeric s then s + else "" +module Options = struct + let parser = OptParse.OptParser.make ~version:Program.version () + let genomes = + let option = OptParse.StdOpt.str_option ~metavar:"file" () in + OptParse.OptParser.add parser ~short_name:'g' ~long_name:"genome-list" option; + (fun () -> match OptParse.Opt.opt option with + Some x -> x + | None -> + OptParse.OptParser.usage parser (); + exit 1 + ) + let columns = + let option = OptParse.StdOpt.int_option ~default:200 () in + OptParse.OptParser.add parser ~short_name:'c' ~long_name:"columns" option; + (fun () -> OptParse.Opt.get option) + let template = + let option = OptParse.StdOpt.str_option ~metavar:"template" + ~default:"$prefix-$n.csv.gz" () in + OptParse.OptParser.add parser ~short_name:'t' ~long_name:"output-template" option; + (fun () -> OptParse.Opt.get option) + let given_prefix = + let x = ref None in + OptParse.StdOpt.str_callback ~metavar:"prefix" (fun s -> x := Some s) + |> OptParse.OptParser.add parser ~short_name:'p' + ~long_name:"output-prefix"; + (fun () -> !x) + let files = OptParse.OptParser.parse_argv parser + let prefix = match given_prefix () with + None -> ( + match guess_prefix files with + "" -> "table" + | s -> s + ) | Some x -> x + let string_of_int n = Printf.sprintf "%08u" n + let output n = template () |> Misc.substitute ["prefix", prefix; "n", string_of_int n] +end + +exception Collide of string * string * string +let match_files_to_genomes genomes files = + let g = Array.of_list genomes in + let gl = Array.length g in + let gi = Array.init gl (fun i -> i) in + Array.sort (fun ai bi -> compare (String.length g.(bi)) (String.length g.(ai))) gi; + let gf = Array.make gl None in + List.iter (fun file -> + try ( + let i = ExtArray.find (fun i -> ExtString.exists file g.(i)) gi in + match gf.(i) with + None -> gf.(i) <- Some file + | Some other_file -> raise (Collide (g.(i), other_file, file)) + ) with Not_found -> () (* file does not match any genomes *) + ) files; + let r = ref [] in + for i = gl - 1 downto 0 do + match gf.(i) with + None -> () (* no file matches a given genome *) + | Some file -> r := (g.(i), file) :: !r + done; + !r + +let columns = Options.columns () +let genomes, files = + let genomes = Misc.io_of_file (Options.genomes ()) |> Misc.enum_of_lines + |> ExtList.of_enum in + let g, f = match_files_to_genomes genomes Options.files |> List.split in + Array.of_list g, + f |> List.map (fun x -> x, x |> open_in |> IO.input_channel) |> Array.of_list + +let newfile, newline, cell, finish = + let i = ref 0 in + let open_file () = + i := !i + 1; + let filename = Options.output !i in + Gzip.open_out filename + in + let c = ref None in + let force_out () = + match !c with + None -> + let d = open_file () in + c := Some d; + d + | Some d -> d + in + let start_of_line = ref true in + (* newfile *) (fun () -> + (match !c with + Some c -> Gzip.close_out c + | None -> ()); + i := !i + 1; + c := None + ), + (* newline *) (fun () -> + let c = force_out () in + Gzip.output_char c '\n'; + start_of_line := true + ), + (* cell *) (fun s -> + let c = force_out () in + if not !start_of_line then Gzip.output_char c ','; + Gzip.output c s 0 (String.length s); + start_of_line := false + ), + (* finish *) (fun () -> + match !c with + Some c -> Gzip.close_out c + | None -> () + ) + +let read_two_fields (filename, c) = + let line = IO.read_line c in + match Misc.split2 line with + Some x -> x + | None -> + Printf.eprintf "\ + There is something wrong with the file %s. \ + The offending line is:\n%s\n" filename line; + exit 1 + +let output_file () = + newfile (); + let yes_we_have_input = + let x = ref false in + (fun () -> + if !x then () + else ( + cell "names"; + x := true + ) + ) + in + let a = Array.make columns "" in + let rec loop i = + if i < columns then ( + try ( + let name, datum = read_two_fields files.(0) in + yes_we_have_input (); + cell name; + a.(i) <- datum; + loop (i + 1) + ) with IO.No_more_input -> + if i > 0 then Array.sub a 0 i + else raise End_of_file + ) else a + in + let a = loop 0 in + let these_columns = Array.length a in + newline (); + cell genomes.(0); + for i = 0 to these_columns - 1 do cell a.(i) done; + newline (); + for i = 1 to Array.length genomes - 1 do + cell genomes.(i); + for j = 0 to these_columns - 1 do + let _, datum = read_two_fields files.(i) in + cell datum + done; + newline () + done + +let () = + try while true do output_file () done + with End_of_file -> finish () diff --git a/src/nbc/test-generate.sml b/src/nbc/test-generate.sml new file mode 100644 index 0000000..77cafd3 --- /dev/null +++ b/src/nbc/test-generate.sml @@ -0,0 +1 @@ +structure Test = Test () diff --git a/src/nbc/test-library.mlb b/src/nbc/test-library.mlb new file mode 100644 index 0000000..2618fdb --- /dev/null +++ b/src/nbc/test-library.mlb @@ -0,0 +1,5 @@ +local + $(SML_LIB)/basis/basis.mlb +in + test-library.sml +end diff --git a/src/nbc/test-library.sml b/src/nbc/test-library.sml new file mode 100644 index 0000000..6821e74 --- /dev/null +++ b/src/nbc/test-library.sml @@ -0,0 +1,50 @@ +signature TEST = sig + type test = + { + description: string + , expectedResult: string + , function: unit -> string + } + val single: test -> unit + val list: test list -> unit +end + +structure Test = struct + fun single {description, expectedResult, function} = + let + val actualResult = function () + in + if expectedResult = actualResult then + TextIO.output ( + TextIO.stdErr + , ( + description + ^ " succeeded.\n" + ) + ) + else ( + TextIO.output ( + TextIO.stdErr + , ( + description + ^ " was supposed to be " + ^ expectedResult + ^ ", but was actually " + ^ actualResult + ^ ".\n" + ) + ); OS.Process.exit OS.Process.failure + ) + end handle exception' => ( + TextIO.output ( + TextIO.stdErr + , ( + description + ^ " failed with exception " + ^ exnMessage exception' + ^ ".\n" + ) + ); OS.Process.exit OS.Process.failure + ) + fun list tests = app single tests +end diff --git a/src/nbc/tree.mlb b/src/nbc/tree.mlb new file mode 100644 index 0000000..4a496bf --- /dev/null +++ b/src/nbc/tree.mlb @@ -0,0 +1,6 @@ +local + $(SML_LIB)/basis/basis.mlb + stream.mlb +in + tree.sml +end diff --git a/src/nbc/tree.sml b/src/nbc/tree.sml new file mode 100644 index 0000000..5d64f51 --- /dev/null +++ b/src/nbc/tree.sml @@ -0,0 +1,225 @@ +functor Tree (Key: sig + type key + val compare: key * key -> order +end) :> sig + type key = Key.key + type 'datum tree + val size: 'datum tree -> int + val empty: 'datum tree + val single: key * 'datum -> 'datum tree + val add: 'datum tree * key * 'datum -> 'datum tree + val find: 'datum tree * key -> 'datum option + val exists: 'datum tree * key -> bool + val up: 'datum tree -> (key * 'datum) Stream.stream + val down: 'datum tree -> (key * 'datum) Stream.stream + val upFrom: 'datum tree * key -> (key * 'datum) Stream.stream + val downFrom: 'datum tree * key -> (key * 'datum) Stream.stream + val fromList: (key * 'datum) list -> 'datum tree + val fromStream: (key * 'datum) Stream.stream -> 'datum tree + (* + val remove: 'datum tree * key -> 'datum tree + *) +end = struct + type key = Key.key + structure Height = Int8 + datatype 'datum tree = + Leaf + | Branch of { + height: Height.int + , less: 'datum tree + , key: key + , datum: 'datum + , greater: 'datum tree + } + fun size tree = case tree of + Leaf => 0 + | Branch {less, greater, ...} => + 1 + size less + size greater + val empty = Leaf + fun single (key, datum) = Branch { + height = 1 + , less = Leaf + , key = key + , datum = datum + , greater = Leaf + } + fun height tree = case tree of + Leaf => 0 + | Branch {height, ...} => height + fun calculateHeight {key, datum, less, greater} = Branch { + key = key + , datum = datum + , less = less + , greater = greater + , height = 1 + Height.max (height less, height greater) + } + fun rotateLess branch = case branch of + (* + b d + a d => b e + c e a c + *) + { + less = a + , key = bKey + , datum = bDatum + , greater = Branch { + less = c + , key = dKey + , datum = dDatum + , greater = e + , height = _ + } + } => calculateHeight { + less = calculateHeight { + less = a + , key = bKey + , datum = bDatum + , greater = c + }, key = dKey + , datum = dDatum + , greater = e + } | _ => raise Fail "rotateLess" + fun rotateGreater branch = case branch of + (* + d b + b e => a d + a c c e + *) + { + less = Branch { + less = a + , key = bKey + , datum = bDatum + , greater = c + , height = _ + }, key = dKey + , datum = dDatum + , greater = e + } => calculateHeight { + less = a + , key = bKey + , datum = bDatum + , greater = calculateHeight { + less = c + , key = dKey + , datum = dDatum + , greater = e + } + } | _ => raise Fail "rotateGreater" + fun balance (branch as {key, datum, less, greater}) = + let + val heightLess = height less + val heightGreater = height greater + in + if heightLess < heightGreater - 2 then + rotateLess branch + else if heightLess > heightGreater + 2 then + rotateGreater branch + else calculateHeight branch + end + fun add (tree, newKey, newDatum) = case tree of + Leaf => single (newKey, newDatum) + | Branch {height, less, key, datum, greater} => + case Key.compare (newKey, key) of + EQUAL => Branch { + height = height + , less = less + , key = newKey + , datum = newDatum + , greater = greater + } | LESS => balance { + less = add (less, newKey, newDatum) + , key = key + , datum = datum + , greater = greater + } | GREATER => balance { + less = less + , key = key + , datum = datum + , greater = add (greater, newKey, newDatum) + } + fun find (tree, desiredKey) = case tree of + Leaf => NONE + | Branch {less, key, datum, greater, ...} => + case Key.compare (desiredKey, key) of + EQUAL => SOME datum + | LESS => find (less, desiredKey) + | GREATER => find (greater, desiredKey) + fun exists (tree, desiredKey) = case find (tree, desiredKey) of + NONE => false + | SOME _ => true + fun up tree = Stream.create (fn () => + case tree of + Leaf => NONE + | Branch {less, key, datum, greater, ...} => + Stream.getItem ( + Stream.append ( + up less + , Stream.cons ( + (key, datum) + , up greater + ) + ) + ) + ) + fun down tree = Stream.create (fn () => + case tree of + Leaf => NONE + | Branch {greater, key, datum, less, ...} => + Stream.getItem ( + Stream.append ( + down greater + , Stream.cons ( + (key, datum) + , down less + ) + ) + ) + ) + fun upFrom (tree, firstKey) = Stream.create (fn () => + case tree of + Leaf => NONE + | Branch {less, key, datum, greater, ...} => + case Key.compare (firstKey, key) of + LESS => Stream.getItem ( + Stream.append ( + upFrom (less, firstKey) + , Stream.cons ( + (key, datum) + , up greater + ) + ) + ) | EQUAL => SOME ( + (key, datum) + , up greater + ) | GREATER => Stream.getItem ( + upFrom (greater, firstKey) + ) + ) + fun downFrom (tree, firstKey) = Stream.create (fn () => + case tree of + Leaf => NONE + | Branch {greater, key, datum, less, ...} => + case Key.compare (firstKey, key) of + LESS => Stream.getItem ( + downFrom (less, firstKey) + ) | EQUAL => SOME ( + (key, datum) + , down less + ) | GREATER => Stream.getItem ( + Stream.append ( + downFrom (greater, firstKey) + , Stream.cons ( + (key, datum) + , down less + ) + ) + ) + ) + fun fromStream stream = + Stream.fold (fn ((key, datum), tree) => + add (tree, key, datum) + ) empty stream + fun fromList list = fromStream (Stream.fromList list) +end -- cgit v1.2.3