1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/.hgignore Tue Jan 24 23:54:05 2012 +0000
1.3 @@ -0,0 +1,22 @@
1.4 +gutcheck-.*\.tar\.gz
1.5 +gutcheck-.*/
1.6 +Makefile$
1.7 +Makefile\.in
1.8 +aclocal\.m4
1.9 +libtool
1.10 +stamp-h1
1.11 +autom4te\.cache
1.12 +config\.log
1.13 +config\.status
1.14 +config/
1.15 +configure
1.16 +\.deps/
1.17 +\.libs/
1.18 +\..*\.swp
1.19 +.*\.o
1.20 +.*\.la
1.21 +.*\.lo
1.22 +.*\.exe
1.23 +gutcheck/gutcheck\.typ
1.24 +gutcheck/gutcheck
1.25 +test/harness/gc-test
2.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
2.2 +++ b/COPYING Tue Jan 24 23:54:05 2012 +0000
2.3 @@ -0,0 +1,340 @@
2.4 + GNU GENERAL PUBLIC LICENSE
2.5 + Version 2, June 1991
2.6 +
2.7 + Copyright (C) 1989, 1991 Free Software Foundation, Inc.
2.8 + 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2.9 + Everyone is permitted to copy and distribute verbatim copies
2.10 + of this license document, but changing it is not allowed.
2.11 +
2.12 + Preamble
2.13 +
2.14 + The licenses for most software are designed to take away your
2.15 +freedom to share and change it. By contrast, the GNU General Public
2.16 +License is intended to guarantee your freedom to share and change free
2.17 +software--to make sure the software is free for all its users. This
2.18 +General Public License applies to most of the Free Software
2.19 +Foundation's software and to any other program whose authors commit to
2.20 +using it. (Some other Free Software Foundation software is covered by
2.21 +the GNU Library General Public License instead.) You can apply it to
2.22 +your programs, too.
2.23 +
2.24 + When we speak of free software, we are referring to freedom, not
2.25 +price. Our General Public Licenses are designed to make sure that you
2.26 +have the freedom to distribute copies of free software (and charge for
2.27 +this service if you wish), that you receive source code or can get it
2.28 +if you want it, that you can change the software or use pieces of it
2.29 +in new free programs; and that you know you can do these things.
2.30 +
2.31 + To protect your rights, we need to make restrictions that forbid
2.32 +anyone to deny you these rights or to ask you to surrender the rights.
2.33 +These restrictions translate to certain responsibilities for you if you
2.34 +distribute copies of the software, or if you modify it.
2.35 +
2.36 + For example, if you distribute copies of such a program, whether
2.37 +gratis or for a fee, you must give the recipients all the rights that
2.38 +you have. You must make sure that they, too, receive or can get the
2.39 +source code. And you must show them these terms so they know their
2.40 +rights.
2.41 +
2.42 + We protect your rights with two steps: (1) copyright the software, and
2.43 +(2) offer you this license which gives you legal permission to copy,
2.44 +distribute and/or modify the software.
2.45 +
2.46 + Also, for each author's protection and ours, we want to make certain
2.47 +that everyone understands that there is no warranty for this free
2.48 +software. If the software is modified by someone else and passed on, we
2.49 +want its recipients to know that what they have is not the original, so
2.50 +that any problems introduced by others will not reflect on the original
2.51 +authors' reputations.
2.52 +
2.53 + Finally, any free program is threatened constantly by software
2.54 +patents. We wish to avoid the danger that redistributors of a free
2.55 +program will individually obtain patent licenses, in effect making the
2.56 +program proprietary. To prevent this, we have made it clear that any
2.57 +patent must be licensed for everyone's free use or not licensed at all.
2.58 +
2.59 + The precise terms and conditions for copying, distribution and
2.60 +modification follow.
2.61 +
2.62 + GNU GENERAL PUBLIC LICENSE
2.63 + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
2.64 +
2.65 + 0. This License applies to any program or other work which contains
2.66 +a notice placed by the copyright holder saying it may be distributed
2.67 +under the terms of this General Public License. The "Program", below,
2.68 +refers to any such program or work, and a "work based on the Program"
2.69 +means either the Program or any derivative work under copyright law:
2.70 +that is to say, a work containing the Program or a portion of it,
2.71 +either verbatim or with modifications and/or translated into another
2.72 +language. (Hereinafter, translation is included without limitation in
2.73 +the term "modification".) Each licensee is addressed as "you".
2.74 +
2.75 +Activities other than copying, distribution and modification are not
2.76 +covered by this License; they are outside its scope. The act of
2.77 +running the Program is not restricted, and the output from the Program
2.78 +is covered only if its contents constitute a work based on the
2.79 +Program (independent of having been made by running the Program).
2.80 +Whether that is true depends on what the Program does.
2.81 +
2.82 + 1. You may copy and distribute verbatim copies of the Program's
2.83 +source code as you receive it, in any medium, provided that you
2.84 +conspicuously and appropriately publish on each copy an appropriate
2.85 +copyright notice and disclaimer of warranty; keep intact all the
2.86 +notices that refer to this License and to the absence of any warranty;
2.87 +and give any other recipients of the Program a copy of this License
2.88 +along with the Program.
2.89 +
2.90 +You may charge a fee for the physical act of transferring a copy, and
2.91 +you may at your option offer warranty protection in exchange for a fee.
2.92 +
2.93 + 2. You may modify your copy or copies of the Program or any portion
2.94 +of it, thus forming a work based on the Program, and copy and
2.95 +distribute such modifications or work under the terms of Section 1
2.96 +above, provided that you also meet all of these conditions:
2.97 +
2.98 + a) You must cause the modified files to carry prominent notices
2.99 + stating that you changed the files and the date of any change.
2.100 +
2.101 + b) You must cause any work that you distribute or publish, that in
2.102 + whole or in part contains or is derived from the Program or any
2.103 + part thereof, to be licensed as a whole at no charge to all third
2.104 + parties under the terms of this License.
2.105 +
2.106 + c) If the modified program normally reads commands interactively
2.107 + when run, you must cause it, when started running for such
2.108 + interactive use in the most ordinary way, to print or display an
2.109 + announcement including an appropriate copyright notice and a
2.110 + notice that there is no warranty (or else, saying that you provide
2.111 + a warranty) and that users may redistribute the program under
2.112 + these conditions, and telling the user how to view a copy of this
2.113 + License. (Exception: if the Program itself is interactive but
2.114 + does not normally print such an announcement, your work based on
2.115 + the Program is not required to print an announcement.)
2.116 +
2.117 +These requirements apply to the modified work as a whole. If
2.118 +identifiable sections of that work are not derived from the Program,
2.119 +and can be reasonably considered independent and separate works in
2.120 +themselves, then this License, and its terms, do not apply to those
2.121 +sections when you distribute them as separate works. But when you
2.122 +distribute the same sections as part of a whole which is a work based
2.123 +on the Program, the distribution of the whole must be on the terms of
2.124 +this License, whose permissions for other licensees extend to the
2.125 +entire whole, and thus to each and every part regardless of who wrote it.
2.126 +
2.127 +Thus, it is not the intent of this section to claim rights or contest
2.128 +your rights to work written entirely by you; rather, the intent is to
2.129 +exercise the right to control the distribution of derivative or
2.130 +collective works based on the Program.
2.131 +
2.132 +In addition, mere aggregation of another work not based on the Program
2.133 +with the Program (or with a work based on the Program) on a volume of
2.134 +a storage or distribution medium does not bring the other work under
2.135 +the scope of this License.
2.136 +
2.137 + 3. You may copy and distribute the Program (or a work based on it,
2.138 +under Section 2) in object code or executable form under the terms of
2.139 +Sections 1 and 2 above provided that you also do one of the following:
2.140 +
2.141 + a) Accompany it with the complete corresponding machine-readable
2.142 + source code, which must be distributed under the terms of Sections
2.143 + 1 and 2 above on a medium customarily used for software interchange; or,
2.144 +
2.145 + b) Accompany it with a written offer, valid for at least three
2.146 + years, to give any third party, for a charge no more than your
2.147 + cost of physically performing source distribution, a complete
2.148 + machine-readable copy of the corresponding source code, to be
2.149 + distributed under the terms of Sections 1 and 2 above on a medium
2.150 + customarily used for software interchange; or,
2.151 +
2.152 + c) Accompany it with the information you received as to the offer
2.153 + to distribute corresponding source code. (This alternative is
2.154 + allowed only for noncommercial distribution and only if you
2.155 + received the program in object code or executable form with such
2.156 + an offer, in accord with Subsection b above.)
2.157 +
2.158 +The source code for a work means the preferred form of the work for
2.159 +making modifications to it. For an executable work, complete source
2.160 +code means all the source code for all modules it contains, plus any
2.161 +associated interface definition files, plus the scripts used to
2.162 +control compilation and installation of the executable. However, as a
2.163 +special exception, the source code distributed need not include
2.164 +anything that is normally distributed (in either source or binary
2.165 +form) with the major components (compiler, kernel, and so on) of the
2.166 +operating system on which the executable runs, unless that component
2.167 +itself accompanies the executable.
2.168 +
2.169 +If distribution of executable or object code is made by offering
2.170 +access to copy from a designated place, then offering equivalent
2.171 +access to copy the source code from the same place counts as
2.172 +distribution of the source code, even though third parties are not
2.173 +compelled to copy the source along with the object code.
2.174 +
2.175 + 4. You may not copy, modify, sublicense, or distribute the Program
2.176 +except as expressly provided under this License. Any attempt
2.177 +otherwise to copy, modify, sublicense or distribute the Program is
2.178 +void, and will automatically terminate your rights under this License.
2.179 +However, parties who have received copies, or rights, from you under
2.180 +this License will not have their licenses terminated so long as such
2.181 +parties remain in full compliance.
2.182 +
2.183 + 5. You are not required to accept this License, since you have not
2.184 +signed it. However, nothing else grants you permission to modify or
2.185 +distribute the Program or its derivative works. These actions are
2.186 +prohibited by law if you do not accept this License. Therefore, by
2.187 +modifying or distributing the Program (or any work based on the
2.188 +Program), you indicate your acceptance of this License to do so, and
2.189 +all its terms and conditions for copying, distributing or modifying
2.190 +the Program or works based on it.
2.191 +
2.192 + 6. Each time you redistribute the Program (or any work based on the
2.193 +Program), the recipient automatically receives a license from the
2.194 +original licensor to copy, distribute or modify the Program subject to
2.195 +these terms and conditions. You may not impose any further
2.196 +restrictions on the recipients' exercise of the rights granted herein.
2.197 +You are not responsible for enforcing compliance by third parties to
2.198 +this License.
2.199 +
2.200 + 7. If, as a consequence of a court judgment or allegation of patent
2.201 +infringement or for any other reason (not limited to patent issues),
2.202 +conditions are imposed on you (whether by court order, agreement or
2.203 +otherwise) that contradict the conditions of this License, they do not
2.204 +excuse you from the conditions of this License. If you cannot
2.205 +distribute so as to satisfy simultaneously your obligations under this
2.206 +License and any other pertinent obligations, then as a consequence you
2.207 +may not distribute the Program at all. For example, if a patent
2.208 +license would not permit royalty-free redistribution of the Program by
2.209 +all those who receive copies directly or indirectly through you, then
2.210 +the only way you could satisfy both it and this License would be to
2.211 +refrain entirely from distribution of the Program.
2.212 +
2.213 +If any portion of this section is held invalid or unenforceable under
2.214 +any particular circumstance, the balance of the section is intended to
2.215 +apply and the section as a whole is intended to apply in other
2.216 +circumstances.
2.217 +
2.218 +It is not the purpose of this section to induce you to infringe any
2.219 +patents or other property right claims or to contest validity of any
2.220 +such claims; this section has the sole purpose of protecting the
2.221 +integrity of the free software distribution system, which is
2.222 +implemented by public license practices. Many people have made
2.223 +generous contributions to the wide range of software distributed
2.224 +through that system in reliance on consistent application of that
2.225 +system; it is up to the author/donor to decide if he or she is willing
2.226 +to distribute software through any other system and a licensee cannot
2.227 +impose that choice.
2.228 +
2.229 +This section is intended to make thoroughly clear what is believed to
2.230 +be a consequence of the rest of this License.
2.231 +
2.232 + 8. If the distribution and/or use of the Program is restricted in
2.233 +certain countries either by patents or by copyrighted interfaces, the
2.234 +original copyright holder who places the Program under this License
2.235 +may add an explicit geographical distribution limitation excluding
2.236 +those countries, so that distribution is permitted only in or among
2.237 +countries not thus excluded. In such case, this License incorporates
2.238 +the limitation as if written in the body of this License.
2.239 +
2.240 + 9. The Free Software Foundation may publish revised and/or new versions
2.241 +of the General Public License from time to time. Such new versions will
2.242 +be similar in spirit to the present version, but may differ in detail to
2.243 +address new problems or concerns.
2.244 +
2.245 +Each version is given a distinguishing version number. If the Program
2.246 +specifies a version number of this License which applies to it and "any
2.247 +later version", you have the option of following the terms and conditions
2.248 +either of that version or of any later version published by the Free
2.249 +Software Foundation. If the Program does not specify a version number of
2.250 +this License, you may choose any version ever published by the Free Software
2.251 +Foundation.
2.252 +
2.253 + 10. If you wish to incorporate parts of the Program into other free
2.254 +programs whose distribution conditions are different, write to the author
2.255 +to ask for permission. For software which is copyrighted by the Free
2.256 +Software Foundation, write to the Free Software Foundation; we sometimes
2.257 +make exceptions for this. Our decision will be guided by the two goals
2.258 +of preserving the free status of all derivatives of our free software and
2.259 +of promoting the sharing and reuse of software generally.
2.260 +
2.261 + NO WARRANTY
2.262 +
2.263 + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
2.264 +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
2.265 +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
2.266 +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
2.267 +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
2.268 +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
2.269 +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
2.270 +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
2.271 +REPAIR OR CORRECTION.
2.272 +
2.273 + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
2.274 +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
2.275 +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
2.276 +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
2.277 +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
2.278 +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
2.279 +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
2.280 +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
2.281 +POSSIBILITY OF SUCH DAMAGES.
2.282 +
2.283 + END OF TERMS AND CONDITIONS
2.284 +
2.285 + How to Apply These Terms to Your New Programs
2.286 +
2.287 + If you develop a new program, and you want it to be of the greatest
2.288 +possible use to the public, the best way to achieve this is to make it
2.289 +free software which everyone can redistribute and change under these terms.
2.290 +
2.291 + To do so, attach the following notices to the program. It is safest
2.292 +to attach them to the start of each source file to most effectively
2.293 +convey the exclusion of warranty; and each file should have at least
2.294 +the "copyright" line and a pointer to where the full notice is found.
2.295 +
2.296 + <one line to give the program's name and a brief idea of what it does.>
2.297 + Copyright (C) <year> <name of author>
2.298 +
2.299 + This program is free software; you can redistribute it and/or modify
2.300 + it under the terms of the GNU General Public License as published by
2.301 + the Free Software Foundation; either version 2 of the License, or
2.302 + (at your option) any later version.
2.303 +
2.304 + This program is distributed in the hope that it will be useful,
2.305 + but WITHOUT ANY WARRANTY; without even the implied warranty of
2.306 + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
2.307 + GNU General Public License for more details.
2.308 +
2.309 + You should have received a copy of the GNU General Public License
2.310 + along with this program; if not, write to the Free Software
2.311 + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
2.312 +
2.313 +
2.314 +Also add information on how to contact you by electronic and paper mail.
2.315 +
2.316 +If the program is interactive, make it output a short notice like this
2.317 +when it starts in an interactive mode:
2.318 +
2.319 + Gnomovision version 69, Copyright (C) year name of author
2.320 + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
2.321 + This is free software, and you are welcome to redistribute it
2.322 + under certain conditions; type `show c' for details.
2.323 +
2.324 +The hypothetical commands `show w' and `show c' should show the appropriate
2.325 +parts of the General Public License. Of course, the commands you use may
2.326 +be called something other than `show w' and `show c'; they could even be
2.327 +mouse-clicks or menu items--whatever suits your program.
2.328 +
2.329 +You should also get your employer (if you work as a programmer) or your
2.330 +school, if any, to sign a "copyright disclaimer" for the program, if
2.331 +necessary. Here is a sample; alter the names:
2.332 +
2.333 + Yoyodyne, Inc., hereby disclaims all copyright interest in the program
2.334 + `Gnomovision' (which makes passes at compilers) written by James Hacker.
2.335 +
2.336 + <signature of Ty Coon>, 1 April 1989
2.337 + Ty Coon, President of Vice
2.338 +
2.339 +This General Public License does not permit incorporating your program into
2.340 +proprietary programs. If your program is a subroutine library, you may
2.341 +consider it more useful to permit linking proprietary applications with the
2.342 +library. If this is what you want to do, use the GNU Library General
2.343 +Public License instead of this License.
3.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
3.2 +++ b/INSTALL Tue Jan 24 23:54:05 2012 +0000
3.3 @@ -0,0 +1,365 @@
3.4 +Installation Instructions
3.5 +*************************
3.6 +
3.7 +Copyright (C) 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2004, 2005,
3.8 +2006, 2007, 2008, 2009 Free Software Foundation, Inc.
3.9 +
3.10 + Copying and distribution of this file, with or without modification,
3.11 +are permitted in any medium without royalty provided the copyright
3.12 +notice and this notice are preserved. This file is offered as-is,
3.13 +without warranty of any kind.
3.14 +
3.15 +Basic Installation
3.16 +==================
3.17 +
3.18 + Briefly, the shell commands `./configure; make; make install' should
3.19 +configure, build, and install this package. The following
3.20 +more-detailed instructions are generic; see the `README' file for
3.21 +instructions specific to this package. Some packages provide this
3.22 +`INSTALL' file but do not implement all of the features documented
3.23 +below. The lack of an optional feature in a given package is not
3.24 +necessarily a bug. More recommendations for GNU packages can be found
3.25 +in *note Makefile Conventions: (standards)Makefile Conventions.
3.26 +
3.27 + The `configure' shell script attempts to guess correct values for
3.28 +various system-dependent variables used during compilation. It uses
3.29 +those values to create a `Makefile' in each directory of the package.
3.30 +It may also create one or more `.h' files containing system-dependent
3.31 +definitions. Finally, it creates a shell script `config.status' that
3.32 +you can run in the future to recreate the current configuration, and a
3.33 +file `config.log' containing compiler output (useful mainly for
3.34 +debugging `configure').
3.35 +
3.36 + It can also use an optional file (typically called `config.cache'
3.37 +and enabled with `--cache-file=config.cache' or simply `-C') that saves
3.38 +the results of its tests to speed up reconfiguring. Caching is
3.39 +disabled by default to prevent problems with accidental use of stale
3.40 +cache files.
3.41 +
3.42 + If you need to do unusual things to compile the package, please try
3.43 +to figure out how `configure' could check whether to do them, and mail
3.44 +diffs or instructions to the address given in the `README' so they can
3.45 +be considered for the next release. If you are using the cache, and at
3.46 +some point `config.cache' contains results you don't want to keep, you
3.47 +may remove or edit it.
3.48 +
3.49 + The file `configure.ac' (or `configure.in') is used to create
3.50 +`configure' by a program called `autoconf'. You need `configure.ac' if
3.51 +you want to change it or regenerate `configure' using a newer version
3.52 +of `autoconf'.
3.53 +
3.54 + The simplest way to compile this package is:
3.55 +
3.56 + 1. `cd' to the directory containing the package's source code and type
3.57 + `./configure' to configure the package for your system.
3.58 +
3.59 + Running `configure' might take a while. While running, it prints
3.60 + some messages telling which features it is checking for.
3.61 +
3.62 + 2. Type `make' to compile the package.
3.63 +
3.64 + 3. Optionally, type `make check' to run any self-tests that come with
3.65 + the package, generally using the just-built uninstalled binaries.
3.66 +
3.67 + 4. Type `make install' to install the programs and any data files and
3.68 + documentation. When installing into a prefix owned by root, it is
3.69 + recommended that the package be configured and built as a regular
3.70 + user, and only the `make install' phase executed with root
3.71 + privileges.
3.72 +
3.73 + 5. Optionally, type `make installcheck' to repeat any self-tests, but
3.74 + this time using the binaries in their final installed location.
3.75 + This target does not install anything. Running this target as a
3.76 + regular user, particularly if the prior `make install' required
3.77 + root privileges, verifies that the installation completed
3.78 + correctly.
3.79 +
3.80 + 6. You can remove the program binaries and object files from the
3.81 + source code directory by typing `make clean'. To also remove the
3.82 + files that `configure' created (so you can compile the package for
3.83 + a different kind of computer), type `make distclean'. There is
3.84 + also a `make maintainer-clean' target, but that is intended mainly
3.85 + for the package's developers. If you use it, you may have to get
3.86 + all sorts of other programs in order to regenerate files that came
3.87 + with the distribution.
3.88 +
3.89 + 7. Often, you can also type `make uninstall' to remove the installed
3.90 + files again. In practice, not all packages have tested that
3.91 + uninstallation works correctly, even though it is required by the
3.92 + GNU Coding Standards.
3.93 +
3.94 + 8. Some packages, particularly those that use Automake, provide `make
3.95 + distcheck', which can be used by developers to test that all other
3.96 + targets like `make install' and `make uninstall' work correctly.
3.97 + This target is generally not run by end users.
3.98 +
3.99 +Compilers and Options
3.100 +=====================
3.101 +
3.102 + Some systems require unusual options for compilation or linking that
3.103 +the `configure' script does not know about. Run `./configure --help'
3.104 +for details on some of the pertinent environment variables.
3.105 +
3.106 + You can give `configure' initial values for configuration parameters
3.107 +by setting variables in the command line or in the environment. Here
3.108 +is an example:
3.109 +
3.110 + ./configure CC=c99 CFLAGS=-g LIBS=-lposix
3.111 +
3.112 + *Note Defining Variables::, for more details.
3.113 +
3.114 +Compiling For Multiple Architectures
3.115 +====================================
3.116 +
3.117 + You can compile the package for more than one kind of computer at the
3.118 +same time, by placing the object files for each architecture in their
3.119 +own directory. To do this, you can use GNU `make'. `cd' to the
3.120 +directory where you want the object files and executables to go and run
3.121 +the `configure' script. `configure' automatically checks for the
3.122 +source code in the directory that `configure' is in and in `..'. This
3.123 +is known as a "VPATH" build.
3.124 +
3.125 + With a non-GNU `make', it is safer to compile the package for one
3.126 +architecture at a time in the source code directory. After you have
3.127 +installed the package for one architecture, use `make distclean' before
3.128 +reconfiguring for another architecture.
3.129 +
3.130 + On MacOS X 10.5 and later systems, you can create libraries and
3.131 +executables that work on multiple system types--known as "fat" or
3.132 +"universal" binaries--by specifying multiple `-arch' options to the
3.133 +compiler but only a single `-arch' option to the preprocessor. Like
3.134 +this:
3.135 +
3.136 + ./configure CC="gcc -arch i386 -arch x86_64 -arch ppc -arch ppc64" \
3.137 + CXX="g++ -arch i386 -arch x86_64 -arch ppc -arch ppc64" \
3.138 + CPP="gcc -E" CXXCPP="g++ -E"
3.139 +
3.140 + This is not guaranteed to produce working output in all cases; you
3.141 +may have to build one architecture at a time and combine the results
3.142 +using the `lipo' tool if you have problems.
3.143 +
3.144 +Installation Names
3.145 +==================
3.146 +
3.147 + By default, `make install' installs the package's commands under
3.148 +`/usr/local/bin', include files under `/usr/local/include', etc. You
3.149 +can specify an installation prefix other than `/usr/local' by giving
3.150 +`configure' the option `--prefix=PREFIX', where PREFIX must be an
3.151 +absolute file name.
3.152 +
3.153 + You can specify separate installation prefixes for
3.154 +architecture-specific files and architecture-independent files. If you
3.155 +pass the option `--exec-prefix=PREFIX' to `configure', the package uses
3.156 +PREFIX as the prefix for installing programs and libraries.
3.157 +Documentation and other data files still use the regular prefix.
3.158 +
3.159 + In addition, if you use an unusual directory layout you can give
3.160 +options like `--bindir=DIR' to specify different values for particular
3.161 +kinds of files. Run `configure --help' for a list of the directories
3.162 +you can set and what kinds of files go in them. In general, the
3.163 +default for these options is expressed in terms of `${prefix}', so that
3.164 +specifying just `--prefix' will affect all of the other directory
3.165 +specifications that were not explicitly provided.
3.166 +
3.167 + The most portable way to affect installation locations is to pass the
3.168 +correct locations to `configure'; however, many packages provide one or
3.169 +both of the following shortcuts of passing variable assignments to the
3.170 +`make install' command line to change installation locations without
3.171 +having to reconfigure or recompile.
3.172 +
3.173 + The first method involves providing an override variable for each
3.174 +affected directory. For example, `make install
3.175 +prefix=/alternate/directory' will choose an alternate location for all
3.176 +directory configuration variables that were expressed in terms of
3.177 +`${prefix}'. Any directories that were specified during `configure',
3.178 +but not in terms of `${prefix}', must each be overridden at install
3.179 +time for the entire installation to be relocated. The approach of
3.180 +makefile variable overrides for each directory variable is required by
3.181 +the GNU Coding Standards, and ideally causes no recompilation.
3.182 +However, some platforms have known limitations with the semantics of
3.183 +shared libraries that end up requiring recompilation when using this
3.184 +method, particularly noticeable in packages that use GNU Libtool.
3.185 +
3.186 + The second method involves providing the `DESTDIR' variable. For
3.187 +example, `make install DESTDIR=/alternate/directory' will prepend
3.188 +`/alternate/directory' before all installation names. The approach of
3.189 +`DESTDIR' overrides is not required by the GNU Coding Standards, and
3.190 +does not work on platforms that have drive letters. On the other hand,
3.191 +it does better at avoiding recompilation issues, and works well even
3.192 +when some directory options were not specified in terms of `${prefix}'
3.193 +at `configure' time.
3.194 +
3.195 +Optional Features
3.196 +=================
3.197 +
3.198 + If the package supports it, you can cause programs to be installed
3.199 +with an extra prefix or suffix on their names by giving `configure' the
3.200 +option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'.
3.201 +
3.202 + Some packages pay attention to `--enable-FEATURE' options to
3.203 +`configure', where FEATURE indicates an optional part of the package.
3.204 +They may also pay attention to `--with-PACKAGE' options, where PACKAGE
3.205 +is something like `gnu-as' or `x' (for the X Window System). The
3.206 +`README' should mention any `--enable-' and `--with-' options that the
3.207 +package recognizes.
3.208 +
3.209 + For packages that use the X Window System, `configure' can usually
3.210 +find the X include and library files automatically, but if it doesn't,
3.211 +you can use the `configure' options `--x-includes=DIR' and
3.212 +`--x-libraries=DIR' to specify their locations.
3.213 +
3.214 + Some packages offer the ability to configure how verbose the
3.215 +execution of `make' will be. For these packages, running `./configure
3.216 +--enable-silent-rules' sets the default to minimal output, which can be
3.217 +overridden with `make V=1'; while running `./configure
3.218 +--disable-silent-rules' sets the default to verbose, which can be
3.219 +overridden with `make V=0'.
3.220 +
3.221 +Particular systems
3.222 +==================
3.223 +
3.224 + On HP-UX, the default C compiler is not ANSI C compatible. If GNU
3.225 +CC is not installed, it is recommended to use the following options in
3.226 +order to use an ANSI C compiler:
3.227 +
3.228 + ./configure CC="cc -Ae -D_XOPEN_SOURCE=500"
3.229 +
3.230 +and if that doesn't work, install pre-built binaries of GCC for HP-UX.
3.231 +
3.232 + On OSF/1 a.k.a. Tru64, some versions of the default C compiler cannot
3.233 +parse its `<wchar.h>' header file. The option `-nodtk' can be used as
3.234 +a workaround. If GNU CC is not installed, it is therefore recommended
3.235 +to try
3.236 +
3.237 + ./configure CC="cc"
3.238 +
3.239 +and if that doesn't work, try
3.240 +
3.241 + ./configure CC="cc -nodtk"
3.242 +
3.243 + On Solaris, don't put `/usr/ucb' early in your `PATH'. This
3.244 +directory contains several dysfunctional programs; working variants of
3.245 +these programs are available in `/usr/bin'. So, if you need `/usr/ucb'
3.246 +in your `PATH', put it _after_ `/usr/bin'.
3.247 +
3.248 + On Haiku, software installed for all users goes in `/boot/common',
3.249 +not `/usr/local'. It is recommended to use the following options:
3.250 +
3.251 + ./configure --prefix=/boot/common
3.252 +
3.253 +Specifying the System Type
3.254 +==========================
3.255 +
3.256 + There may be some features `configure' cannot figure out
3.257 +automatically, but needs to determine by the type of machine the package
3.258 +will run on. Usually, assuming the package is built to be run on the
3.259 +_same_ architectures, `configure' can figure that out, but if it prints
3.260 +a message saying it cannot guess the machine type, give it the
3.261 +`--build=TYPE' option. TYPE can either be a short name for the system
3.262 +type, such as `sun4', or a canonical name which has the form:
3.263 +
3.264 + CPU-COMPANY-SYSTEM
3.265 +
3.266 +where SYSTEM can have one of these forms:
3.267 +
3.268 + OS
3.269 + KERNEL-OS
3.270 +
3.271 + See the file `config.sub' for the possible values of each field. If
3.272 +`config.sub' isn't included in this package, then this package doesn't
3.273 +need to know the machine type.
3.274 +
3.275 + If you are _building_ compiler tools for cross-compiling, you should
3.276 +use the option `--target=TYPE' to select the type of system they will
3.277 +produce code for.
3.278 +
3.279 + If you want to _use_ a cross compiler, that generates code for a
3.280 +platform different from the build platform, you should specify the
3.281 +"host" platform (i.e., that on which the generated programs will
3.282 +eventually be run) with `--host=TYPE'.
3.283 +
3.284 +Sharing Defaults
3.285 +================
3.286 +
3.287 + If you want to set default values for `configure' scripts to share,
3.288 +you can create a site shell script called `config.site' that gives
3.289 +default values for variables like `CC', `cache_file', and `prefix'.
3.290 +`configure' looks for `PREFIX/share/config.site' if it exists, then
3.291 +`PREFIX/etc/config.site' if it exists. Or, you can set the
3.292 +`CONFIG_SITE' environment variable to the location of the site script.
3.293 +A warning: not all `configure' scripts look for a site script.
3.294 +
3.295 +Defining Variables
3.296 +==================
3.297 +
3.298 + Variables not defined in a site shell script can be set in the
3.299 +environment passed to `configure'. However, some packages may run
3.300 +configure again during the build, and the customized values of these
3.301 +variables may be lost. In order to avoid this problem, you should set
3.302 +them in the `configure' command line, using `VAR=value'. For example:
3.303 +
3.304 + ./configure CC=/usr/local2/bin/gcc
3.305 +
3.306 +causes the specified `gcc' to be used as the C compiler (unless it is
3.307 +overridden in the site shell script).
3.308 +
3.309 +Unfortunately, this technique does not work for `CONFIG_SHELL' due to
3.310 +an Autoconf bug. Until the bug is fixed you can use this workaround:
3.311 +
3.312 + CONFIG_SHELL=/bin/bash /bin/bash ./configure CONFIG_SHELL=/bin/bash
3.313 +
3.314 +`configure' Invocation
3.315 +======================
3.316 +
3.317 + `configure' recognizes the following options to control how it
3.318 +operates.
3.319 +
3.320 +`--help'
3.321 +`-h'
3.322 + Print a summary of all of the options to `configure', and exit.
3.323 +
3.324 +`--help=short'
3.325 +`--help=recursive'
3.326 + Print a summary of the options unique to this package's
3.327 + `configure', and exit. The `short' variant lists options used
3.328 + only in the top level, while the `recursive' variant lists options
3.329 + also present in any nested packages.
3.330 +
3.331 +`--version'
3.332 +`-V'
3.333 + Print the version of Autoconf used to generate the `configure'
3.334 + script, and exit.
3.335 +
3.336 +`--cache-file=FILE'
3.337 + Enable the cache: use and save the results of the tests in FILE,
3.338 + traditionally `config.cache'. FILE defaults to `/dev/null' to
3.339 + disable caching.
3.340 +
3.341 +`--config-cache'
3.342 +`-C'
3.343 + Alias for `--cache-file=config.cache'.
3.344 +
3.345 +`--quiet'
3.346 +`--silent'
3.347 +`-q'
3.348 + Do not print messages saying which checks are being made. To
3.349 + suppress all normal output, redirect it to `/dev/null' (any error
3.350 + messages will still be shown).
3.351 +
3.352 +`--srcdir=DIR'
3.353 + Look for the package's source code in directory DIR. Usually
3.354 + `configure' can determine that directory automatically.
3.355 +
3.356 +`--prefix=DIR'
3.357 + Use DIR as the installation prefix. *note Installation Names::
3.358 + for more details, including other options available for fine-tuning
3.359 + the installation locations.
3.360 +
3.361 +`--no-create'
3.362 +`-n'
3.363 + Run the configure checks, but stop before creating any output
3.364 + files.
3.365 +
3.366 +`configure' also accepts some other, not widely useful, options. Run
3.367 +`configure --help' for more details.
3.368 +
4.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
4.2 +++ b/Makefile.am Tue Jan 24 23:54:05 2012 +0000
4.3 @@ -0,0 +1,1 @@
4.4 +SUBDIRS=gclib gutcheck test doc
5.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
5.2 +++ b/README Tue Jan 24 23:54:05 2012 +0000
5.3 @@ -0,0 +1,68 @@
5.4 + gutcheck
5.5 + ========
5.6 +
5.7 +General installation instructions can be found in INSTALL. The following
5.8 +aim to give a quick overview and some help for specific systems. Documentation
5.9 +for gutcheck itself can be found in doc/gutcheck.txt and for the test
5.10 +framework in doc/gc-test.txt.
5.11 +
5.12 +Linux
5.13 +-----
5.14 +
5.15 +You should be able to use the standard:
5.16 +
5.17 +% ./configure
5.18 +% make
5.19 +% sudo make install
5.20 +
5.21 +If you get an error about no package 'glib-2.0' found, then you need to
5.22 +install the development package for glib2. Under Fedora, RHEL and friends
5.23 +that would be:
5.24 +
5.25 +% sudo yum install gcc pkgconfig glib2-devel
5.26 +
5.27 +Under Debian, Ubuntu and friends that would be:
5.28 +
5.29 +% sudo apt-get install gcc pkg-config libglib2.0-dev
5.30 +
5.31 +If you get really stuck, you can use the --without-glib option to configure,
5.32 +but this may well not be supported in a future version so this is probably
5.33 +best avoided.
5.34 +
5.35 +Microsoft Windows
5.36 +-----------------
5.37 +
5.38 +It should be possible to use MSYS (http://www.mingw.org/wiki/MSYS) to build
5.39 +on a Windows machine. You'll need a copy of the development package for
5.40 +glib and its dependencies from http://www.gtk.org/download/win32.php.
5.41 +
5.42 +It's much easier to build using a cross-compiler from Linux, if you have
5.43 +access to such a system. Under Fedora, RHEL and friends you can do this
5.44 +with:
5.45 +
5.46 +% sudo yum install mingw32-gcc pkgconfig mingw32-glib2-static \
5.47 + mingw32-gettext-static mingw32-iconv-static
5.48 +% ./configure --host=i686-w64-mingw32 --disable-shared \
5.49 + --bindir=/gutcheck --datadir=/
5.50 +% make
5.51 +% mkdir build
5.52 +% make install DESTDIR=`pwd`/build
5.53 +
5.54 +The contents of the build/gutcheck directory can then be copied to a
5.55 +Microsoft Windows machine.
5.56 +
5.57 +Depending on the version of mingw32-gcc you use, you may need to specify a
5.58 +different host type. If you're not sure look and see what the cross-compiler
5.59 +is called (eg., i686-pc-mingw32-gcc) and use the prefix as the host type.
5.60 +
5.61 +Mac
5.62 +---
5.63 +
5.64 +I think this should be quite similar to Linux, doing something like this:
5.65 +
5.66 +% sudo port install gcc pkgconfig glib2-devel
5.67 +% ./configure
5.68 +% make
5.69 +% sudo make install
5.70 +
5.71 +It may also be possible to use fink instead of macports.
6.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
6.2 +++ b/bootstrap.sh Tue Jan 24 23:54:05 2012 +0000
6.3 @@ -0,0 +1,6 @@
6.4 +#!/bin/sh
6.5 +mkdir -p config
6.6 +aclocal && \
6.7 + libtoolize && \
6.8 + automake --foreign --add-missing && \
6.9 + autoconf
7.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
7.2 +++ b/configure.ac Tue Jan 24 23:54:05 2012 +0000
7.3 @@ -0,0 +1,94 @@
7.4 +# -*- Autoconf -*-
7.5 +# Process this file with autoconf to produce a configure script.
7.6 +
7.7 +AC_INIT([gutcheck],[1.50],[ali@juiblex.co.uk])
7.8 +AC_PREREQ(2.59)
7.9 +AC_CONFIG_AUX_DIR([config])
7.10 +AC_CONFIG_SRCDIR([gutcheck/gutcheck.c])
7.11 +AC_CONFIG_FILES([Makefile
7.12 +gclib/Makefile
7.13 +gutcheck/Makefile
7.14 +test/Makefile
7.15 +test/harness/Makefile
7.16 +test/compatibility/Makefile
7.17 +doc/Makefile
7.18 +])
7.19 +AM_INIT_AUTOMAKE(no-define)
7.20 +AC_CANONICAL_HOST
7.21 +
7.22 +##################################################
7.23 +# Checks for programs.
7.24 +##################################################
7.25 +AC_PROG_CC
7.26 +LT_INIT
7.27 +# Libtool supports a --disable-shared option to tell it to avoid
7.28 +# building shared versions of libraries. We don't have any libraries
7.29 +# but we do want to support building static versions of our executables.
7.30 +# Libtool can do this (under the right circumstances) so we overload
7.31 +# this switch for this purpose.
7.32 +#
7.33 +# The libtool option that we use (-static-libtool-libs) means to use
7.34 +# static linking with libraries that supply a .la file and which
7.35 +# include a non-empty value for "old_library". If the library doesn't
7.36 +# include a .la file (they are deleted by some distributions), then
7.37 +# this option will have no effect and likewise if old_library is set
7.38 +# to '' (eg., if the library was built with --disable-static) then
7.39 +# again -static-libtool-libs will have no effect.
7.40 +#
7.41 +# If old_library is set to a non-empty value, then specifying
7.42 +# -static-libtool-libs will cause the link to fail if the old library
7.43 +# cannot be found (libtool will not fallback to a shared library
7.44 +# in these circumstances). This can happen with Fedora, for example,
7.45 +# if a main mingw32 library package is installed but not the
7.46 +# corresponding static sub-package. The solution is either to
7.47 +# install the relevant static sub-packages or not to use --disable-shared.
7.48 +AS_IF([test "$enable_shared" = no],[
7.49 + LDFLAGS="$LDFLAGS -static-libtool-libs"
7.50 +])
7.51 +PKG_PROG_PKG_CONFIG
7.52 +
7.53 +##################################################
7.54 +# Checks for header files.
7.55 +##################################################
7.56 +
7.57 +##################################################
7.58 +# Checks for typedefs, structures, and compiler characteristics.
7.59 +##################################################
7.60 +
7.61 +##################################################
7.62 +# Checks for libraries.
7.63 +##################################################
7.64 +AC_MSG_CHECKING([whether to use glib])
7.65 +AC_ARG_WITH([glib],[AS_HELP_STRING([--without-glib],
7.66 + [use internal re-invented wheel rather than glib2])])
7.67 +AS_IF([test "$with_glib" != no],[
7.68 + AC_MSG_RESULT([yes])
7.69 + PKG_CHECK_MODULES([GLIB],[glib-2.0])
7.70 + AC_DEFINE([HAVE_GLIB],[1],[Define if you have glib version 2.x])
7.71 +],[
7.72 + AC_MSG_RESULT([no])
7.73 +])
7.74 +AM_CONDITIONAL([HAVE_GLIB],[test "$with_glib" != no])
7.75 +
7.76 +# NOTE: If we are using a static version of glib then we
7.77 +# should define GLIB_STATIC_COMPILATION. This isn't needed
7.78 +# when glib is built only for static use (in which case
7.79 +# glibconfig.h will already define GLIB_STATIC_COMPILATION).
7.80 +# It's not easy to tell if libtool will actually link with
7.81 +# a static glib but luckily we don't currently need to;
7.82 +# this pre-processor define only affects the behaviour of
7.83 +# libraries which use glib and we don't have any.
7.84 +
7.85 +##################################################
7.86 +# Checks for library functions.
7.87 +##################################################
7.88 +AC_CHECK_FUNCS_ONCE([mkstemp])
7.89 +
7.90 +##################################################
7.91 +# Checks for processor independent files.
7.92 +##################################################
7.93 +
7.94 +##################################################
7.95 +# Generate the various configured files
7.96 +##################################################
7.97 +AC_OUTPUT
8.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
8.2 +++ b/doc/Makefile.am Tue Jan 24 23:54:05 2012 +0000
8.3 @@ -0,0 +1,3 @@
8.4 +dist_pkgdata_DATA=gutcheck.txt gc-test.txt
8.5 +
8.6 +EXTRA_DIST=README-0.99
9.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
9.2 +++ b/doc/README-0.99 Tue Jan 24 23:54:05 2012 +0000
9.3 @@ -0,0 +1,24 @@
9.4 +RELEASE NOTES FOR GUTCHECK 0.99 20051105
9.5 +----------------------------------------
9.6 +
9.7 +This is the README file for Gutcheck.
9.8 +
9.9 +Gutcheck is a command-line tool for finding problems in
9.10 +files for submission to Project Gutenberg.
9.11 +
9.12 +You should have received the following files:
9.13 +
9.14 + GUTCHECK.EXE MS-DOS Executable
9.15 + gutcheck.txt Documentation
9.16 + gutcheck.c Source code
9.17 + gutcheck.typ A sample typo file
9.18 + README This file
9.19 + COPYING A copy of the GNU GPL licence
9.20 +
9.21 +This program is free software, without warranty of any kind,
9.22 +licensed under the GNU GPL. A copy of the GNU GPL, entitled
9.23 +'COPYING' should be present. If not, you can find one at
9.24 +http://www.fsf.org.
9.25 +
9.26 +Gutcheck was written by Jim Tinsley, who can be reached at
9.27 +jtinsley@pobox.com, or via http://gutcheck.sourceforge.net.
10.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
10.2 +++ b/doc/gc-test.txt Tue Jan 24 23:54:05 2012 +0000
10.3 @@ -0,0 +1,64 @@
10.4 + gutcheck test framework
10.5 + =======================
10.6 +
10.7 +Running existing testcases
10.8 +--------------------------
10.9 +
10.10 +The test harness (the program that runs a test) is called gc-test. The various
10.11 +testcases are stored in multiple text files, typically with a .tst extension.
10.12 +
10.13 +To run a testcase when all of gutcheck, gc-test and the testcase file are
10.14 +in the current directory simply do something like:
10.15 +
10.16 +% gc-test missing-space.tst
10.17 +
10.18 +from a command prompt. Under MS-Windows, this is called a command window and
10.19 +the prompt will normally look slightly different, eg.,
10.20 +
10.21 +C:\DP> gc-test missing-space.tst
10.22 +
10.23 +To run all the tests in the current directory, do something like this:
10.24 +
10.25 +% gc-test *.tst
10.26 +
10.27 +If gutcheck is not in the current directory, then you can set an environment
10.28 +variable (GUTCHECK) to point at it. For example, on MS-Windows you might do:
10.29 +
10.30 +C:\DP> set GUTCHECK=C:\GUTCHECK\GUTCHECK.EXE
10.31 +C:\DP> gc-test *.tst
10.32 +
10.33 +Writing your own testcases
10.34 +--------------------------
10.35 +
10.36 +Writing a new testcase is pretty painless. Most testcases follow this simple
10.37 +pattern:
10.38 +
10.39 + ┌──────────────────────────────────────────┐
10.40 + │**************** INPUT **************** │
10.41 + │"Look!John, over there!" │
10.42 + │**************** EXPECTED ****************│
10.43 + │ │
10.44 + │"Look!John, over there!" │
10.45 + │ Line 1 column 6 - Missing space? │
10.46 + └──────────────────────────────────────────┘
10.47 +
10.48 +The sixteen asterisks in this example form what is known as the "flag". This
10.49 +flag must come before and after all tags (eg., INPUT and EXPECTED). In the
10.50 +unlikely event that you need sixteen asterisks at the start of a line of text,
10.51 +then simply choose a different flag and use it throughout the file (flags
10.52 +can be any sequence of ASCII characters except control codes and space).
10.53 +
10.54 +Note that the header that gutcheck normally outputs is not included in the
10.55 +expected output. This avoids problems with not knowing beforehand the name
10.56 +of the file that gutcheck will be asked to look at (and saves typing!).
10.57 +gutcheck prints a blank line before each warning. These are not part of the
10.58 +header and so do need to be included.
10.59 +
10.60 +To test that gutcheck produces no output, you still need to include
10.61 +an EXPECTED tag, just with no text following it. If there is no EXPECTED
10.62 +tag, then gc-test will consider that no expectation exists and won't check
10.63 +the output at all.
10.64 +
10.65 +There is no support yet for non-ASCII testcases, embedded linefeeds,
10.66 +passing command line options to gutcheck or for testcases which are
10.67 +expected to fail.
11.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
11.2 +++ b/doc/gutcheck.txt Tue Jan 24 23:54:05 2012 +0000
11.3 @@ -0,0 +1,742 @@
11.4 +
11.5 +
11.6 + Gutcheck documentation
11.7 +
11.8 +
11.9 +gutcheck: lists possible common formatting errors in a Project
11.10 +Gutenberg candidate file. It is a command line program and can be used
11.11 +under Win32 or Unix (gutcheck.c should compile anywhere; if it doesn't,
11.12 +tell me). For Windows-only people, there is an appendix at the end
11.13 +with brief instructions for running it.
11.14 +
11.15 +
11.16 +Current version: 0.99. Users of 0.98 see end of file for changes.
11.17 +
11.18 +You should also have received the licence file COPYING, a README file,
11.19 +gutcheck.c, the source code, and gutcheck.exe, a DOS executable, with
11.20 +this file.
11.21 +
11.22 +This software is Copyright Jim Tinsley 2000-2005.
11.23 +
11.24 +Gutcheck comes with ABSOLUTELY NO WARRANTY. For details, read the file COPYING.
11.25 +This is Free Software; you may redistribute it under certain conditions (GPL).
11.26 +
11.27 +See http://gutcheck.sourceforge.net for the latest version.
11.28 +
11.29 +
11.30 +Usage is: gutcheck [-setopxlywm] filename
11.31 + where:
11.32 + -s checks Single quotes
11.33 + -e switches off Echoing of lines
11.34 + -t checks Typos
11.35 + -o produces an Overview only
11.36 + -p sets strict quotes checking for Paragraphs
11.37 + -x (paranoid) switches OFF typo checking and extra checks
11.38 + -l turns off Line-end checks
11.39 + -y sets error messages to stdout
11.40 + -w is a special mode for web uploads (for future use)
11.41 + -v (verbose) forces individual reporting of minor problems
11.42 + -m interprets Markup of some common HTML tags and entities
11.43 + -u warns about words in a user-defined typo file gutcheck.typ
11.44 + -d ignores some DP-specific markup
11.45 +
11.46 +Running gutcheck without any parameters will display a brief help message.
11.47 +
11.48 +Sample usage:
11.49 +
11.50 + gutcheck warpeace.txt
11.51 +
11.52 +
11.53 +More detail:
11.54 +
11.55 + Echoing lines (-e to switch off)
11.56 +
11.57 + You may find it convenient, when reviewing Gutcheck's
11.58 + suggestions, to see the line that Gutcheck is questioning.
11.59 + That way, you can often see at a glance whether it is
11.60 + a real error that needs to be fixed, or a false positive
11.61 + that should be in the text, but Gutcheck's limited
11.62 + programming doesn't understand.
11.63 +
11.64 + By default, gutcheck echoes these lines, but if you don't
11.65 + want to see the lines referred to, -e will switch it OFF.
11.66 +
11.67 +
11.68 + Quotes (-s and -p switches)
11.69 +
11.70 + Gutcheck always looks for unbalanced doublequotes in a
11.71 + paragraph. It is a common convention for writers not to
11.72 + close quotes in a paragraph if the next paragraph opens
11.73 + with quotes and is a continuation by the same speaker.
11.74 +
11.75 + Gutcheck therefore does not normally report unclosed quotes
11.76 + if the next paragraph begins with a quote. If you need
11.77 + to see all unclosed quotes, even where the next paragraph
11.78 + begins with a quote, you should use the -p switch.
11.79 +
11.80 + Singlequotes (') are a problem, since the same character
11.81 + is used for an apostrophe. I'm not sure that it is
11.82 + possible to get 100% accuracy on singlequotes checking,
11.83 + particularly since dialect, quite common in PG texts,
11.84 + upsets the normal rules so badly. Consider the sentence:
11.85 + 'Tis often said that a man's a man for a' that.
11.86 + As humans, we recognize that both apostrophes are used
11.87 + for contractions rather than quotes, but it isn't easy
11.88 + to get a program to recognize that.
11.89 +
11.90 + Since Gutcheck makes too many mistakes when trying to match
11.91 + singlequotes, it doesn't look for unbalanced singlequotes
11.92 + unless you specify the -s switch.
11.93 +
11.94 + Consider these sentences, which illustrate the main cases:
11.95 +
11.96 + 'Tis often said that a fool and his money are soon parted.
11.97 +
11.98 + 'Becky's goin' home,' said Tom.
11.99 +
11.100 + The dogs' tails wagged in unison.
11.101 +
11.102 + Those 'pack dogs' of yours look more like wolves.
11.103 +
11.104 +
11.105 +
11.106 + Typos (-t switch)
11.107 +
11.108 + It's not Gutcheck's job to be a spelling checker, but it
11.109 + does check for a list of common typos and OCR errors if you
11.110 + use the -t switch. (The -x switch turns typo checking off.)
11.111 +
11.112 + It also checks for character combinations, especially involving
11.113 + h and b, which are often confused by OCR, that rarely or never
11.114 + occur. For example, it queries "tbe" in a word. Now, "the" often
11.115 + occurs, but "tbe" is very rare (heartbeat, hotbed), so I'm
11.116 + playing the odds - a few false positives for many errors found.
11.117 + Similarly with "ii", which is a very common OCR error.
11.118 +
11.119 + Gutcheck suppresses multiple reporting of the first 40 "typos"
11.120 + found. This is to remove the annoyance of seeing something like
11.121 + "FN" (footnote) or "LK" (initials) flagged as a typo 147 times
11.122 + in a text.
11.123 +
11.124 +
11.125 + Line-end checking (-l switch to disable)
11.126 +
11.127 + All PG texts should have a Carriage Return (CR - character 13)
11.128 + and a Line Feed (LF - character 10) at end of each line,
11.129 + regardless of what O/S you made them on. DOS/Windows, Unix
11.130 + and Mac have different conventions, but the final text should
11.131 + always use a CR/LF pair as its line terminator.
11.132 +
11.133 + By default, Gutcheck verifies that every line does have
11.134 + the correct terminator, but if you're on a work-in-progress
11.135 + in Linux, you might want to convert the line-ends as a final
11.136 + step, and not want to see thousands of errors every time you
11.137 + run Gutcheck before that final step, so you can turn off
11.138 + this checking with the -l switch.
11.139 +
11.140 +
11.141 + Paranoid mode (-x switch to disable: Trust No One :-)
11.142 +
11.143 + -x automatically switches OFF typo-checking (the -t flag)
11.144 + and some extra checks, like standalone 1 and 0 queries.
11.145 +
11.146 +
11.147 + Overview mode (-o switch)
11.148 +
11.149 + This mode just gives a count of queries found
11.150 + instead of a detailed list.
11.151 +
11.152 +
11.153 + Header quote (-h switch)
11.154 +
11.155 + If you use the -h switch, gutcheck will also display
11.156 + the Title, Author, Release and Edition fields from the
11.157 + PG header. This is useful mostly for the automated
11.158 + checks we do on recently-posted texts.
11.159 +
11.160 +
11.161 + Errors to stdout (-y switch)
11.162 +
11.163 + If you're just running gutcheck normally, you can ignore
11.164 + this. It's only there for programs that provide a front
11.165 + end to gutcheck. It makes error messages appear within
11.166 + the output of gutcheck so that the front end knows whether
11.167 + gutcheck ran OK.
11.168 +
11.169 +
11.170 + Verbose reporting (-v switch)
11.171 +
11.172 + Normally, if gutcheck sees lots of long lines, short lines,
11.173 + spaced dashes, non-ASCII characters or dot-commas ".," it
11.174 + assumes these are features of the text, counts and summarizes
11.175 + them at the top of its report, but does not list them
11.176 + individually. If the -v switch is on, gutcheck will list them all.
11.177 +
11.178 +
11.179 + Markup interpretation (-m switch)
11.180 +
11.181 + Normally, gutcheck flags anything it suspects of being HTML
11.182 + markup as a possible error. When you use the -m switch,
11.183 + however, it matches anything that looks like markup against
11.184 + a short list of common HTML tags and entities. If the markup
11.185 + is in that list, it either ignores the markup, in the case
11.186 + of a tag, or "interprets" the markup as its nearest ASCII
11.187 + equivalent, in the case of an entity. So, for example, using
11.188 + this switch, gutcheck will "see"
11.189 +
11.190 + “He went <i>thataway!</i>”
11.191 +
11.192 + as
11.193 +
11.194 + "He went thataway!"
11.195 +
11.196 + and report accordingly.
11.197 +
11.198 + This switch does not, not, NOT check the validity of HTML;
11.199 + it exists so that you can run gutcheck on most HTML texts
11.200 + for PG, and get sane results. It does not support all tags.
11.201 + It does not support all entities. When it sees a tag or entity
11.202 + it does not recognize, it will query it as HTML just as if
11.203 + you hadn't specified the -m switch.
11.204 +
11.205 + Gutcheck 0.99 will automatically switch on markup interpretation
11.206 + if it sees a lot of tags that appear to be markup, so mostly, you
11.207 + won't have to specify this.
11.208 +
11.209 + User-defined typos (-u switch)
11.210 +
11.211 + If you have a file named gutcheck.typ either in your current
11.212 + working directory or in the directory from which you explicitly
11.213 + invoked gutcheck, but not necessarily on your path, and if you
11.214 + specify the -u switch, gutcheck will query any word specified
11.215 + in that file. The file is simple: one word, in lower case, per
11.216 + line. 999 lines are allowed for. Be careful not to put multiple
11.217 + words onto a line, or leave any rubbish other than the word on
11.218 + the line. You should have received a sample file gutcheck.typ
11.219 + with this package.
11.220 +
11.221 + Ignore DP markup (-d switch)
11.222 +
11.223 + Distributed Proofreaders (http://www.pgdp.net) is currently
11.224 + (2005) the main source of PG texts, and proofers there use
11.225 + special conventions. This switch understands those conventions,
11.226 + so that people can use gutcheck on files in process that still
11.227 + haven't had the special conventions removed yet. The special
11.228 + conventions supported in 0.99 are page-separators and
11.229 + "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/".
11.230 +
11.231 +
11.232 +You will probably only run gutcheck on a text once or maybe twice,
11.233 +just prior to uploading; it usually finds a few formatting problems;
11.234 +it also usually finds queries that aren't problems at all - it often
11.235 +questions Tables of Contents for having short lines, for example.
11.236 +These are called "false positives", and need a human to decide on
11.237 +them.
11.238 +
11.239 +The text should be standard prose, and already close to PG normal
11.240 +format (plain text, about 70 characters per line with blank lines
11.241 +between paragraphs).
11.242 +
11.243 +Gutcheck merely draws your attention to things that might be errors.
11.244 +It is NOT a substitute for human judgement. Formatting choices like
11.245 +short lines may be for a reason that this program can't understand.
11.246 +
11.247 +Even the most careful human proofing can leave errors behind in a
11.248 +text, and there are several automated checks you can do to help find
11.249 +them. Of these, spellchecking (with _very_ careful human judgement) is
11.250 +the most important and most useful.
11.251 +
11.252 +Gutcheck does perform some basic typo-checking if you ask it to,
11.253 +but its focus is on formatting errors specific to PG texts -
11.254 +mismatched quotes, non-ASCII characters, bad spacing, bad line
11.255 +length, HTML tags perhaps left from a conversion, unbalanced
11.256 +brackets.
11.257 +
11.258 +Suggestions for additional checks would be appreciated and duly
11.259 +considered, but no guarantees that they will be implemented.
11.260 +
11.261 +
11.262 +
11.263 +
11.264 + How do _I_ use it?
11.265 +
11.266 +Practically everyone I give gutcheck to asks me how _I_ use it.
11.267 +Well, when I get a text for posting, say filename.txt, I run
11.268 +
11.269 + gutcheck -o filename.txt
11.270 +
11.271 +That gives me a quick idea what I'm dealing with. It'll tell
11.272 +me what kind of problems gutcheck sees, and give me an idea
11.273 +of how much more work needs to be done on the text. Keep in
11.274 +mind that gutcheck doesn't do anything like a full spellcheck,
11.275 +but when I see a text that has a lot of problems, I assume that
11.276 +it probably needs a spellcheck too.
11.277 +
11.278 +Having got a feel for the ballpark, I run
11.279 +
11.280 + gutcheck filename.txt > jj
11.281 +
11.282 +where jj is my personal, all-purpose filename for temporary data
11.283 +that doesn't need to be kept. Then I open filename.txt and jj in
11.284 +a split-screen view in my editor, and work down the text, fixing
11.285 +whatever needs fixing, and skipping whatever doesn't. If your
11.286 +editor doesn't split-screen, you can get much the same effect by
11.287 +opening your original file in your normal editor, and jj (or your
11.288 +equivalent name) in something like Notepad, keeping both in view
11.289 +at the same time.
11.290 +
11.291 +Twice a day, an automatic process looks at all recently-posted
11.292 +texts, and emails Michael, me, and sometimes other people with
11.293 +their gutcheck summaries.
11.294 +
11.295 +
11.296 +
11.297 + Future development of gutcheck
11.298 +
11.299 +Gutcheck has gone about as far as it can, given its current
11.300 +structure. In order to add better singlequotes checking,
11.301 +sentence checking, better he/be checking and other good stuff
11.302 +that I'd like to see, I'll have to rewrite it from a different
11.303 +angle - looking at the syntax instead of the lines. And I'll
11.304 +probably get around to that sooner or later.
11.305 +
11.306 +Meantime, I'm just trying to get this version stabilized, so
11.307 +please report any bugs you find. When it is stable, I'll run
11.308 +up a Windows port for those timid souls who can't look a
11.309 +command line in the eye. :-)
11.310 +
11.311 +And I've started work on gutspell, a companion to gutcheck
11.312 +which will concentrate on spelling problems. PG spelling
11.313 +problems are unusual, since the range of texts we cover is
11.314 +so wide, and I'll be taking a somewhat unorthodox approach
11.315 +to writing this spelling-checker _specifically_ for texts
11.316 +containing a lot of dialect and uncommon words that have
11.317 +probably already been spell-checked against a standard
11.318 +modern dictionary.
11.319 +
11.320 +
11.321 +
11.322 +
11.323 +Explanations of common gutcheck messages:
11.324 +
11.325 + --> 74 lines in this file have white space at end
11.326 +
11.327 + PG texts shouldn't have extra white space added at end of line.
11.328 + Don't worry too much about this; they're not doing any harm,
11.329 + and they'll be removed during posting anyway.
11.330 +
11.331 +
11.332 + --> 348 lines in this file are short. Not reporting short lines.
11.333 + --> 84 lines in this file are long. Not reporting long lines.
11.334 + --> 8 lines in this file are VERY long!
11.335 +
11.336 + If there are a lot of long or short lines, Gutcheck won't list
11.337 + them individually. The short lines version of this message
11.338 + is commonly seen when gutchecking poetry and some plays, where
11.339 + the normal line length is shorter than the standard for prose.
11.340 + A "VERY long" line is one over 80 characters. You normally
11.341 + shouldn't have any of these, but sometimes you may have to render
11.342 + a table that must be that long, or some special preformatted
11.343 + quotation that can't be broken.
11.344 +
11.345 +
11.346 + --> There are 75 spaced dashes and em-dashes in this file. Not reporting them.
11.347 +
11.348 + The PG standard for an emdash--like these--is two minus signs
11.349 + with no spaces before or after them. However, some older texts
11.350 + used spaced dashes - like these -- and if there are very many
11.351 + such spaced dashes in the file, gutcheck just draws your
11.352 + attention to it and doesn't list them individually.
11.353 +
11.354 +
11.355 +
11.356 + Line 3020 - Non-ASCII character 233
11.357 +
11.358 + Standard PG texts should use only ASCII characters with values
11.359 + up to 127; however, non-English, accented characters can be
11.360 + represented according to several different non-ASCII encoding
11.361 + schemes, using values over 127. If you have a plain English text
11.362 + with a few accented characters in words like cafe or tete-a-tete,
11.363 + you should replace the accented characters with their unaccented
11.364 + versions. The English pound sign is another commonly-seen
11.365 + non-ASCII character. If you have enough non-ASCII characters in
11.366 + your text that you feel removing them would degrade your text
11.367 + unacceptably, you should probably consider doing an 8-bit text
11.368 + as well as a plain-ASCII version.
11.369 +
11.370 +
11.371 +
11.372 + Line 1207 - Non-ISO-8859 character 156
11.373 +
11.374 + Even in "8-bit" texts, there are distinctions between code sets.
11.375 + The ISO-8859 family of 8-bit code sets is the most commonly used
11.376 + in PG, and these sets do not define values in the range 128 through
11.377 + 159 as printable characters. It's quite common for someone on a
11.378 + Windows or Mac machine to use a non-ISO character inadvertently,
11.379 + so this message warns that the character is not only not ASCII,
11.380 + but also outside the ISO-8859 range.
11.381 +
11.382 +
11.383 +
11.384 + Line 46 - Tab character?
11.385 +
11.386 + Some editors and WPs will put in Tab characters (character 9) to
11.387 + indicate indented text. You should not use these in a PG text,
11.388 + because you can't be sure how they will appear on a reader's
11.389 + screen. Find the Tab, and replace it with the appropriate number
11.390 + of spaces.
11.391 +
11.392 +
11.393 + Line 1327 - Tilde character?
11.394 +
11.395 + The tilde character (~) might be legitimately used, but it's the
11.396 + character commonly used by OCR software to indicate a place where
11.397 + it couldn't make out the letter, so gutcheck flags it.
11.398 +
11.399 +
11.400 +
11.401 + Line 1347 - Asterisk?
11.402 +
11.403 + Asterisks are reported only in paranoid mode (see -x).
11.404 + Like tildes, they are often used to indicate errors, but they are
11.405 + also legitimately used as line delimiters and footnote markers.
11.406 +
11.407 +
11.408 +
11.409 + Line 1451 - Long line 129
11.410 +
11.411 + PG texts should have lines shorter than 76. There may be occasions
11.412 + where you decide that you really have to go out to 79 characters,
11.413 + but the sample above says that line 1451 is 129 characters long -
11.414 + probably two lines run together.
11.415 +
11.416 +
11.417 +
11.418 + Line 1590 - Short line?
11.419 +
11.420 + PG texts should have lines longer than 54 characters. However,
11.421 + there are special cases like poetry and tables of contents where
11.422 + the lines _should_ be shorter. So treat Gutcheck warnings about
11.423 + short lines carefully. Sometimes it's a genuine formatting
11.424 + problem; sometimes the line really needs to be short.
11.425 +
11.426 + Hint: gutcheck will not flag lines as short if they are indented
11.427 + - if they start with a space. I like to start inserted stanzas
11.428 + and other such items indented with a couple of spaces so that
11.429 + they stand out from the main text anyway.
11.430 +
11.431 +
11.432 +
11.433 + Line 1804 - Begins with punctuation?
11.434 +
11.435 + Lines should normally not begin with commas, periods and so on.
11.436 + An exception is ellipses . . . which can happen at start of line.
11.437 +
11.438 +
11.439 +
11.440 + Line 1850 - Spaced em-dash?
11.441 +
11.442 + The PG standard for an em-dash--like these--is two minus signs
11.443 + with no spaces before or after them. Gutcheck flags non-PG
11.444 + em-dashes - like this one. Normally, you will replace it with a
11.445 + PG-standard em-dash.
11.446 +
11.447 +
11.448 +
11.449 + Line 1904 - Query he/be error?
11.450 +
11.451 + Gutcheck makes a very minor effort to look for that scourge of all
11.452 + proofreaders, "be" replacing "he" or vice-versa, and draws your
11.453 + attention to it when it thinks it has found one.
11.454 +
11.455 +
11.456 +
11.457 + Line 2017 - Query digit in a1most
11.458 +
11.459 + The digit 1 is commonly OCRed for the letter l, the digit 0 for
11.460 + the letter O, and so on. When gutcheck sees a mix of digits and
11.461 + letters, it warns you. It may generate a false positive for
11.462 + something like 7am.
11.463 +
11.464 +
11.465 +
11.466 + Line 2083 - Query standalone 0
11.467 +
11.468 + In paranoid mode (see -x) only, gutcheck warns about the digit 0
11.469 + and the number 1 standing alone as a word. This can happen if the
11.470 + OCR misreads the words O or I.
11.471 +
11.472 +
11.473 +
11.474 + Line 2115 - Query word whetber
11.475 +
11.476 + If you have switched typo-checking on, gutcheck looks for
11.477 + potential typos, especially common h/b errors. It's not
11.478 + infallible; it sometimes queries legit words, but it's
11.479 + always worth taking a look.
11.480 +
11.481 +
11.482 +
11.483 + Line 2190 column 14 - Missing space?
11.484 +
11.485 + Omitting a space is a very common error,especially coming from
11.486 + OCRed text,and can be hard for a human to spot. The commas in
11.487 + the previous sentence illustrate the kind of thing I mean.
11.488 +
11.489 +
11.490 +
11.491 + Line 2240 column 48 - Spaced punctuation?
11.492 +
11.493 + The flip side of the "missing space" error , here , is when extra
11.494 + spaces are added before punctuation . Some old texts appear to add
11.495 + extra spaces around punctuation consistently, but this was a
11.496 + typographical convention rather than the author's intent, and the
11.497 + extra "spaces" should be removed when preparing a PG text.
11.498 +
11.499 +
11.500 +
11.501 + Line 2301 column 19 - Unspaced quotes?
11.502 +
11.503 + Another common spacing problem occurs in a phrase like "You wait
11.504 + there,"he said.
11.505 +
11.506 +
11.507 +
11.508 + Line 2385 column 27 - Wrongspaced quotes?
11.509 +
11.510 + As of version 0.98, gutcheck adds extra checks on whether a quote
11.511 + seems to be a start or end quote, and queries those that appear to
11.512 + be misplaced. This does give rise to false positives when quotes are
11.513 + nested, for example:
11.514 +
11.515 + "And how," she asked, "will your "friends" help you now?"
11.516 +
11.517 + but these false positives are worth it because of the many cases
11.518 + that this test catches, notably those like:
11.519 +
11.520 + "And how, "she said," will your friends help you now?"
11.521 +
11.522 + Sometimes a "wrongspaced quotes" query will arise because an earlier
11.523 + quote in the paragraph was omitted, so if the place specified seems
11.524 + to be OK, look back to see whether there's a problem in the preceding
11.525 + lines.
11.526 +
11.527 +
11.528 +
11.529 + Line 2400 - HTML Tag? <PRE>
11.530 +
11.531 + Some PG texts have been converted from HTML, and not all of the
11.532 + HTML tags have been removed.
11.533 +
11.534 +
11.535 +
11.536 + Line 2402 - HTML symbol? &emdash;
11.537 +
11.538 + Similarly, special HTML symbol characters can survive into PG
11.539 + texts. This check can occasionally produce amusing false positives like
11.540 + . . . Marwick & Co were well known for it;
11.541 +
11.542 +
11.543 +
11.544 + Line 2540 - Mismatched quotes
11.545 +
11.546 + Another gutcheck mainstay - unclosed doublequotes in a paragraph.
11.547 + See the discussion of quotes in the switches section near the
11.548 + start of this file.
11.549 +
11.550 + Since the mismatch doesn't occur on any one line, gutcheck quotes
11.551 + the line number of the first blank line following the paragraph,
11.552 + since this is the point where it reconciles the count of quotes.
11.553 + However, if gutcheck is echoing lines, that is, you haven't used
11.554 + the -e switch, it will show the _first_ line of the paragraph,
11.555 + to help you find the place without using line numbers. The
11.556 + offending paragraph is therefore between the quoted line and
11.557 + the line number given.
11.558 +
11.559 +
11.560 +
11.561 + Line 2587 - Mismatched single quotes
11.562 +
11.563 + Only checked with the -s switch, since checking single quotes is
11.564 + not a very reliable process. Otherwise, the same logic as for
11.565 + doublequotes applies.
11.566 +
11.567 +
11.568 +
11.569 + Line 2877 - Mismatched round brackets?
11.570 +
11.571 + The same test covers curly and square brackets. Texts with a lot of
11.572 + brackets, like plays with bracketed stage instructions, may have mismatches.
11.573 +
11.574 +
11.575 + Line 3150 - No CR?
11.576 + Line 3204 - Two successive CRs?
11.577 + Line 3281 position 75 - CR without LF?
11.578 +
11.579 + These are the invalid line-end warnings. See the discussion of
11.580 + line-end checking in the switches section near the start of this
11.581 + file. If you see these, and your editor doesn't show anything
11.582 + wrong, you should probably try deleting the characters just before
11.583 + and after the line end, and the line-end itself, then retyping the
11.584 + characters and the line-end.
11.585 +
11.586 +
11.587 + Line 2940 - Paragraph starts with lower-case
11.588 +
11.589 + A common error in an e-text is for an extra blank line
11.590 +
11.591 + to be put in, like the blank line above, and this often
11.592 + shows up as a new paragraph beginning with lower case.
11.593 + Sometimes the blank line is deliberate, as when a
11.594 + quotation is inserted in a speech. Use your judgement.
11.595 +
11.596 +
11.597 + Line 2987 - Extra period?
11.598 +
11.599 + An extra period. is a. common problem in OCRed text. and usually
11.600 + arises when a speck of dust on the page is mistaken for a period.
11.601 + or. as occasionally happens. when a comma loses its tail.
11.602 +
11.603 +
11.604 + Line 3012 column 12 - Double punctuation?
11.605 +
11.606 + Double punctuation., like that,, is a common typo and
11.607 + scanno. Some books have much legit double punctuation,
11.608 + like etc., etc., but it's worth checking anyway.
11.609 +
11.610 +
11.611 +
11.612 + * * * *
11.613 +
11.614 +For Windows-only users who are unfamiliar with DOS:
11.615 +
11.616 + If you're a Windows-only user, you need to save
11.617 + gutcheck.exe into the folder (directory) where the
11.618 + text file you want to check is. Let's say your
11.619 + text file is in C:\GUT, then you should save
11.620 + GUTCHECK.EXE into C:\GUT.
11.621 +
11.622 + Now get to a DOS prompt. You can do this by
11.623 + selecting the "Command Prompt" or "MS-DOS Prompt"
11.624 + option that will be somewhere on your
11.625 + Start/Programs menu.
11.626 +
11.627 + Now get into the C:\GUT directory.
11.628 + You can do this using the CD (change directory)
11.629 + command, like this:
11.630 + CD \GUT
11.631 + and your prompt will change to
11.632 + C:\GUT>
11.633 + so you know you're in the right place.
11.634 +
11.635 + Now type
11.636 + gutcheck yourfile.txt
11.637 + and you'll see gutcheck's report.
11.638 +
11.639 + By default, gutcheck prints its queries to screen.
11.640 + If you want to create a file of them, to edit
11.641 + against the text, you can use the greater-than
11.642 + sign (>) to tell it to output the report to a
11.643 + file. For example, if you want its report in a
11.644 + file called QUERIES.LST, you could type
11.645 +
11.646 + gutcheck yourfile.txt > queries.lst
11.647 +
11.648 + The queries.lst file will then contain the listing
11.649 + of possible formatting errors, and you can
11.650 + edit it alongside your text.
11.651 +
11.652 + Whatever you do, DON'T make the filename after
11.653 + the greater-than sign the name of a file already
11.654 + on your disk that you want to keep, because
11.655 + the greater-than sign will cause gutcheck to
11.656 + replace any existing file of that name.
11.657 +
11.658 + So, for example, if you have two Tolstoy files
11.659 + that you want to check, called WARPEACE.TXT and
11.660 + ANNAK.TXT, make sure that neither of these names
11.661 + is ever used following the greater-than sign.
11.662 + To check these correctly, you might do:
11.663 +
11.664 + gutcheck warpeace.txt > war.lst
11.665 +
11.666 + and
11.667 +
11.668 + gutcheck annak.txt > annak.lst
11.669 +
11.670 + separately. Then you can look at war.lst and annak.lst
11.671 + to see the gutcheck reports.
11.672 +
11.673 + * * * *
11.674 +
11.675 +
11.676 +For existing 0.98 users upgrading to 0.99:
11.677 +
11.678 + If you run on old 16-bit DOS or Windows 3.x, I'm afraid
11.679 + you're out of luck. I'm not saying it _can't_ be compiled
11.680 + to run on 16-bit, but the executable with the package is
11.681 + for Win32 only. *nix users won't notice the change at all.
11.682 +
11.683 +
11.684 + There are two new switches: -u and -d.
11.685 + See above for full rundown.
11.686 +
11.687 +
11.688 +Here's a list of the new errors:
11.689 +
11.690 + Line 1456 - Carat character?
11.691 +
11.692 + I^ve found a few.
11.693 +
11.694 +
11.695 + Line 1821 - Forward slash?
11.696 +
11.697 + Common error for italicized "I", or so /'ve found.
11.698 +
11.699 +
11.700 + Line 2139 - Query missing paragraph break?
11.701 +
11.702 + "Come here, son." "Do I _have_ to go, dad?"
11.703 + Like that. False positives in some texts. Sorry 'bout that,
11.704 + but these are often errors.
11.705 +
11.706 +
11.707 + Line 2200 - Query had/bad error?
11.708 +
11.709 + Clear enough. Doesn't catch as many as I'd like it to,
11.710 + but rarely gives false alarms.
11.711 +
11.712 +
11.713 + Line 2268 - Query punctuation after the?
11.714 +
11.715 + Some words, like "the", very rarely have punctuation
11.716 + following them. Others, like "Mrs", usually have a
11.717 + period, but never a comma. Occasional false positives.
11.718 +
11.719 +
11.720 + Line 2380 - Query possible scanno arid
11.721 +
11.722 + It found one of your user-defined typos when you
11.723 + used the -u switch.
11.724 +
11.725 +
11.726 + Line 2511 - Capital "S"?
11.727 +
11.728 + Surprisingly common specific case, like: Jane'S
11.729 +
11.730 +
11.731 + Line 3469 - endquote missing punctuation?
11.732 +
11.733 + OK. This one can really cause a lot of false positives
11.734 + in some books, but it switches itself off if it finds
11.735 + more than 20 in a text, unless you force it to list them
11.736 + all with the -v switch.
11.737 + "Hey, dad" Johnny said, "can we go now?"
11.738 + is a common punctuation-missing error.
11.739 +
11.740 +
11.741 + Line 4266 - Mismatched underscores?
11.742 +
11.743 + Like mismatched anything else!
11.744 +
11.745 +
12.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
12.2 +++ b/gclib/Makefile.am Tue Jan 24 23:54:05 2012 +0000
12.3 @@ -0,0 +1,10 @@
12.4 +AM_CPPFLAGS=-I$(top_srcdir)
12.5 +AM_CFLAGS=$(GLIB_CFLAGS)
12.6 +LIBS=$(GLIB_LIBS)
12.7 +# The extra sources added below are only compiled when GLib is unavailable.
12.8 +noinst_LTLIBRARIES=libgc.la
12.9 +libgc_la_SOURCES=gclib.h textfileutils.c textfileutils.h spawn.c spawn.h
12.10 +if !HAVE_GLIB
12.11 +libgc_la_SOURCES+=macros.h types.h fileutils.c fileutils.h mem.c mem.h \
12.12 + strfuncs.c strfuncs.h gcstring.c gcstring.h utils.c utils.h
12.13 +endif
13.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
13.2 +++ b/gclib/fileutils.c Tue Jan 24 23:54:05 2012 +0000
13.3 @@ -0,0 +1,46 @@
13.4 +#include <stdlib.h>
13.5 +#include <stdio.h>
13.6 +#include <gclib/macros.h>
13.7 +#include <gclib/mem.h>
13.8 +#include <gclib/fileutils.h>
13.9 +#include <gclib/gcstring.h>
13.10 +
13.11 +/*
13.12 + * Read the whole of filename into *contents as a nul-terminated buffer (to be
13.13 + * freed with mem_free) and, if length is non-NULL, store its size in *length.
13.14 + * Returns TRUE on success; on error prints a message with perror, returns FALSE.
13.15 + */
13.16 +boolean file_get_contents(const char *filename,char **contents,size_t *length)
13.17 +{
13.18 +    FILE *fp;
13.19 +    size_t n;
13.20 +    char *buffer;
13.21 +    String *string;
13.22 +    fp=fopen(filename,"rb");
13.23 +    if (!fp)
13.24 +    {
13.25 +        perror(filename);
13.26 +        return FALSE;
13.27 +    }
13.28 +    buffer=mem_new(char,1024);
13.29 +    string=string_new(NULL);
13.30 +    do
13.31 +    {
13.32 +        n=fread(buffer,1,1024,fp);
13.33 +        if (ferror(fp))  /* n is unsigned, so an error never shows up as n<0 */
13.34 +        {
13.35 +            perror(filename);
13.36 +            string_free(string,TRUE);
13.37 +            mem_free(buffer);
13.38 +            fclose(fp);  /* a stream must be released with fclose, not free */
13.39 +            return FALSE;
13.40 +        }
13.41 +        string_append_len(string,buffer,n);
13.42 +    } while(n);  /* a zero-byte read without error means end of file */
13.43 +    mem_free(buffer);
13.44 +    if (length)
13.45 +        *length=string->len;
13.46 +    *contents=string_free(string,FALSE);  /* keep the data, drop the wrapper */
13.47 +    fclose(fp);
13.48 +    return TRUE;
13.49 +}
14.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
14.2 +++ b/gclib/fileutils.h Tue Jan 24 23:54:05 2012 +0000
14.3 @@ -0,0 +1,8 @@
14.4 +#ifndef GC_FILEUTILS_H
14.5 +#define GC_FILEUTILS_H
14.6 +#include <stddef.h>  /* size_t, so that this header is self-contained */
14.7 +#include <gclib/types.h>
14.8 +/* Reads a whole file into a buffer that the caller frees with mem_free. */
14.9 +boolean file_get_contents(const char *filename,char **contents,size_t *length);
14.10 +
14.11 +#endif /* GC_FILEUTILS_H */
15.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
15.2 +++ b/gclib/gclib.h Tue Jan 24 23:54:05 2012 +0000
15.3 @@ -0,0 +1,36 @@
15.4 +#if HAVE_GLIB
15.5 +/* GLib is available: map each gclib name onto its GLib counterpart. */
15.6 +#include <glib.h>
15.7 +#define GC_DIR_SEPARATOR G_DIR_SEPARATOR
15.8 +#define GC_DIR_SEPARATOR_S G_DIR_SEPARATOR_S
15.9 +#define GC_IS_DIR_SEPARATOR(c) G_IS_DIR_SEPARATOR(c)
15.10 +#define boolean gboolean
15.11 +#define String GString
15.12 +#define mem_new0 g_new0
15.13 +#define mem_free g_free
15.14 +#define str_dup g_strdup
15.15 +#define str_ndup g_strndup
15.16 +#define path_get_basename g_path_get_basename
15.17 +#define file_get_contents(filename,contents,length) \
15.18 + g_file_get_contents(filename,contents,length,NULL)
15.19 +#define string_new g_string_new
15.20 +#define string_append g_string_append
15.21 +#define string_append_len g_string_append_len
15.22 +#define string_append_c g_string_append_c
15.23 +#define string_free g_string_free
15.24 +#define string_set_size g_string_set_size
15.25 +
15.26 +#else /* !HAVE_GLIB */
15.27 +/* No GLib: pull in gclib's own declarations instead. */
15.28 +#include <gclib/macros.h>
15.29 +#include <gclib/types.h>
15.30 +#include <gclib/mem.h>
15.31 +#include <gclib/fileutils.h>
15.32 +#include <gclib/strfuncs.h>
15.33 +#include <gclib/gcstring.h>
15.34 +#include <gclib/utils.h>
15.35 +
15.36 +#endif /* HAVE_GLIB */
15.37 +/* The headers below are included in both configurations. */
15.38 +#include <gclib/textfileutils.h>
15.39 +#include <gclib/spawn.h>
16.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
16.2 +++ b/gclib/gcstring.c Tue Jan 24 23:54:05 2012 +0000
16.3 @@ -0,0 +1,90 @@
16.4 +#include <stdlib.h>
16.5 +#include <string.h>
16.6 +#include <gclib/gcstring.h>
16.7 +#include <gclib/types.h>
16.8 +#include <gclib/mem.h>
16.9 +#include <gclib/strfuncs.h>
16.10 +
16.11 +/*
16.12 + * Strings which manage their own memory
16.13 + */
16.14 +
16.15 +String *string_new(const char *init)
16.16 +{
16.17 +    const char *text=init?init:"";  /* NULL means start out empty */
16.18 +    size_t length=strlen(text);
16.19 +    String *fresh=mem_new(String,1);
16.20 +    fresh->str=str_dup(text);
16.21 +    fresh->alloc=length+1;  /* the text plus its terminating nul */
16.22 +    fresh->len=length;
16.23 +    return fresh;
16.24 +}
16.25 +
16.26 +/*
16.27 + * Dispose of a String. With free_segment TRUE the character data is freed
16.28 + * too and NULL is returned; otherwise the data is handed back to the caller.
16.29 + */
16.30 +char *string_free(String *string,boolean free_segment)
16.31 +{
16.32 +    char *contents;
16.33 +    contents=string->str;
16.34 +    /* The wrapper itself is always released. */
16.35 +    mem_free(string);
16.36 +    if (!free_segment)
16.37 +        /* Ownership of the character data passes to the caller. */
16.38 +        return contents;
16.39 +    /* Otherwise the character data goes as well. */
16.40 +    mem_free(contents);
16.41 +    return NULL;
16.42 +}
16.43 +
16.44 +/*
16.45 + * Add the single byte c to the end of string, keeping it nul-terminated.
16.46 + */
16.47 +void string_append_c(String *string,char c)
16.48 +{
16.49 +    if (string->alloc==string->len+1)  /* only the nul's slot is left */
16.50 +    {
16.51 +        string->alloc+=string->alloc;
16.52 +        string->str=mem_renew(char,string->str,string->alloc);
16.53 +    }
16.54 +    string->str[string->len]=c;
16.55 +    string->str[++string->len]='\0';
16.56 +}
16.57 +
16.58 +/*
16.59 + * Add len bytes starting at s to the end of string. A negative len means
16.60 + * that s is nul-terminated and its length should be measured with strlen.
16.61 + */
16.62 +void string_append_len(String *string,const char *s,ssize_t len)
16.63 +{
16.64 +    size_t count=len<0?strlen(s):(size_t)len;
16.65 +    size_t total=string->len+count;
16.66 +    size_t wanted=string->alloc;
16.67 +    while (total>=wanted)  /* double until total bytes plus the nul fit */
16.68 +        wanted*=2;
16.69 +    if (wanted!=string->alloc)
16.70 +        string->str=mem_renew(char,string->str,wanted);
16.71 +    string->alloc=wanted;
16.72 +    memcpy(string->str+string->len,s,count);
16.73 +    string->str[total]='\0';
16.74 +    string->len=total;
16.75 +}
16.76 +
16.77 +/*
16.78 + * Change the length of a String to len. Shrinking truncates the text.
16.79 + * Growing enlarges the buffer as needed; the bytes between the old and the
16.80 + * new length are left undefined. Either way string->str[string->len] is
16.81 + * set to a nul byte.
16.82 + */
16.83 +void string_set_size(String *string,size_t len)
16.84 +{
16.85 +    size_t wanted=string->alloc;
16.86 +    while (len>=wanted)  /* double until len bytes plus the nul fit */
16.87 +        wanted*=2;
16.88 +    if (wanted!=string->alloc)
16.89 +        string->str=mem_renew(char,string->str,wanted);
16.90 +    string->alloc=wanted;
16.91 +    string->str[len]='\0';
16.92 +    string->len=len;
16.93 +}
17.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
17.2 +++ b/gclib/gcstring.h Tue Jan 24 23:54:05 2012 +0000
17.3 @@ -0,0 +1,18 @@
17.4 +#ifndef GC_STRING_H
17.5 +#define GC_STRING_H
17.6 +
17.7 +#include <unistd.h>  /* ssize_t */
17.8 +#include <gclib/types.h>
17.9 +/* Growable string: str holds len bytes plus a nul; alloc is its capacity. */
17.10 +typedef struct {
17.11 +    char *str;
17.12 +    size_t alloc,len;
17.13 +} String;
17.14 +
17.15 +String *string_new(const char *init);
17.16 +char *string_free(String *string,boolean free_segment);
17.17 +void string_append_c(String *string,char c);
17.18 +void string_append_len(String *string,const char *s,ssize_t len);
17.19 +void string_set_size(String *string,size_t len);  /* was missing a prototype */
17.20 +#define string_append(string,s) string_append_len(string,s,-1)
17.21 +#endif /* GC_STRING_H */
18.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
18.2 +++ b/gclib/macros.h Tue Jan 24 23:54:05 2012 +0000
18.3 @@ -0,0 +1,7 @@
18.4 +#ifndef FALSE
18.5 +#define FALSE 0
18.6 +#endif
18.7 +
18.8 +#ifndef TRUE
18.9 +#define TRUE (!FALSE)
18.10 +#endif
19.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
19.2 +++ b/gclib/mem.c Tue Jan 24 23:54:05 2012 +0000
19.3 @@ -0,0 +1,54 @@
19.4 +#include <stdlib.h>
19.5 +#include <stdio.h>
19.6 +#include <string.h>
19.7 +#include <gclib/mem.h>
19.8 +
19.9 +/*
19.10 + * Allocate nmemb elements of size bytes each, aborting on failure so callers
19.11 + * never have to handle out of memory. A request whose total byte count would
19.12 + * overflow size_t is treated as a failure rather than silently wrapping.
19.13 + */
19.14 +void *mem_alloc(size_t nmemb,size_t size)
19.15 +{
19.16 +    void *ptr=(size && nmemb>(size_t)-1/size)?NULL:malloc(nmemb*size);
19.17 +    if (!ptr)
19.18 +    {
19.19 +        fprintf(stderr,
19.20 +          "Not enough memory to allocate %lu elements of %lu bytes.\n",
19.21 +          (unsigned long)nmemb,(unsigned long)size);
19.22 +        abort();
19.23 +    }
19.24 +    return ptr;
19.25 +}
19.26 +
19.27 +/*
19.28 + * Like mem_alloc, except that the returned memory is filled with zero bytes.
19.29 + */
19.30 +void *mem_alloc0(size_t nmemb,size_t size)
19.31 +{
19.32 +    void *block;
19.33 +    block=calloc(nmemb,size);
19.34 +    if (block==NULL)
19.35 +    {
19.36 +        fprintf(stderr,"Not enough memory to allocate %lu elements of "
19.37 +          "%lu bytes.\n",(unsigned long)nmemb,(unsigned long)size);
19.38 +        abort();  /* does not return */
19.39 +    }
19.40 +    return block;
19.41 +}
19.42 +
19.43 +/*
19.44 + * Resize a block to nmemb*size bytes, aborting on failure or size_t overflow.
19.45 + */
19.46 +void *mem_realloc(void *ptr,size_t nmemb,size_t size)
19.47 +{
19.48 +    ptr=(size && nmemb>(size_t)-1/size)?NULL:realloc(ptr,nmemb*size);
19.49 +    if (!ptr)
19.50 +    {
19.51 +        fprintf(stderr,
19.52 +          "Not enough memory to allocate %lu elements of %lu bytes.\n",
19.53 +          (unsigned long)nmemb,(unsigned long)size);
19.54 +        abort();
19.55 +    }
19.56 +    return ptr;
19.57 +}
20.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
20.2 +++ b/gclib/mem.h Tue Jan 24 23:54:05 2012 +0000
20.3 @@ -0,0 +1,13 @@
20.4 +#ifndef GC_MEM_H
20.5 +#define GC_MEM_H
20.6 +#include <stdlib.h>  /* size_t, and free() as used by mem_free */
20.7 +void *mem_alloc(size_t nmemb,size_t size);
20.8 +void *mem_alloc0(size_t nmemb,size_t size);
20.9 +void *mem_realloc(void *ptr,size_t nmemb,size_t size);
20.10 +/* Typed wrappers: n counts elements of type, not bytes. */
20.11 +#define mem_new(type,n) ((type *)mem_alloc(n,sizeof(type)))
20.12 +#define mem_new0(type,n) ((type *)mem_alloc0(n,sizeof(type)))
20.13 +#define mem_renew(type,ptr,n) ((type *)mem_realloc(ptr,n,sizeof(type)))
20.14 +#define mem_free(ptr) free(ptr)
20.15 +
20.16 +#endif /* GC_MEM_H */
21.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
21.2 +++ b/gclib/spawn.c Tue Jan 24 23:54:05 2012 +0000
21.3 @@ -0,0 +1,84 @@
21.4 +#include <stdlib.h>
21.5 +#include <stdio.h>
21.6 +#ifndef WIN32
21.7 +#include <sys/wait.h>
21.8 +#endif
21.9 +#include <gclib/gclib.h>
21.10 +
21.11 +#define SPAWN_BUFSIZE 128
21.12 +/* Run argv[] to completion, optionally capturing its stdout and exit code. */
21.13 +boolean spawn_sync(char **argv,char **standard_output,int *exit_status)
21.14 +{
21.15 +/* Don't use g_spawn_sync on WIN32 for now to avoid needing the helper */
21.16 +#if HAVE_GLIB && !defined(WIN32)
21.17 +    char *standard_error=NULL;
21.18 +    GError *error=NULL;
21.19 +    gboolean retval;
21.20 +    GSpawnFlags flags=G_SPAWN_SEARCH_PATH;
21.21 +    if (!standard_output)
21.22 +        flags|=G_SPAWN_STDOUT_TO_DEV_NULL;  /* keep G_SPAWN_SEARCH_PATH set */
21.23 +    retval=g_spawn_sync(NULL,argv,NULL,flags,NULL,NULL,standard_output,
21.24 +      &standard_error,exit_status,&error);
21.25 +    fputs(standard_error?standard_error:"",stderr);  /* may still be NULL */
21.26 +    g_free(standard_error);
21.27 +    if (!retval)
21.28 +    {
21.29 +        fprintf(stderr,"%s\n",error->message);
21.30 +        g_error_free(error);
21.31 +    }
21.32 +    else if (exit_status)
21.33 +        *exit_status=WEXITSTATUS(*exit_status);
21.34 +    return retval;
21.35 +#else
21.36 +    FILE *fp;
21.37 +    int i,r;
21.38 +    size_t n,len;
21.39 +    String *command_line,*string;
21.40 +    command_line=string_new(NULL);
21.41 +    for(i=0;argv[i];i++)  /* join the arguments with single spaces */
21.42 +    {
21.43 +        if (i)
21.44 +            string_append_c(command_line,' ');
21.45 +        string_append(command_line,argv[i]);
21.46 +    }
21.47 +    fp=popen(command_line->str,"r");
21.48 +    if (!fp)
21.49 +    {
21.50 +        perror(command_line->str);  /* report before the string is freed */
21.51 +        string_free(command_line,TRUE);
21.52 +        return FALSE;
21.53 +    }
21.54 +    string_free(command_line,TRUE);
21.55 +    string=string_new(NULL);
21.56 +    do
21.57 +    {
21.58 +        len=string->len;
21.59 +        string_set_size(string,len+SPAWN_BUFSIZE);  /* make room for a chunk */
21.60 +        n=fread(string->str+len,1,SPAWN_BUFSIZE,fp);
21.61 +        if (ferror(fp))  /* n is unsigned, so it cannot signal an error */
21.62 +        {
21.63 +            perror("fread");
21.64 +            (void)pclose(fp);
21.65 +            string_free(string,TRUE);
21.66 +            return FALSE;
21.67 +        }
21.68 +        string_set_size(string,len+n);  /* trim to what was actually read */
21.69 +    } while(n);
21.70 +    r=pclose(fp);
21.71 +    if (r<0)
21.72 +    {
21.73 +        perror("pclose");
21.74 +        string_free(string,TRUE);
21.75 +        return FALSE;
21.76 +    }
21.77 +#ifndef WIN32
21.78 +    r=WEXITSTATUS(r);  /* report the exit code, as the g_spawn_sync branch does */
21.79 +#endif
21.80 +    if (exit_status)
21.81 +        *exit_status=r;
21.82 +    if (standard_output)
21.83 +        *standard_output=string->str;  /* the caller takes over the buffer */
21.84 +    string_free(string,standard_output==NULL);
21.85 +    return TRUE;
21.86 +#endif
21.87 +}
22.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
22.2 +++ b/gclib/spawn.h Tue Jan 24 23:54:05 2012 +0000
22.3 @@ -0,0 +1,8 @@
22.4 +#ifndef GC_SPAWN_H
22.5 +#define GC_SPAWN_H
22.6 +
22.7 +#include <gclib/gclib.h>
22.8 +
22.9 +boolean spawn_sync(char **argv,char **standard_output,int *exit_status);
22.10 +
22.11 +#endif /* GC_SPAWN_H */
23.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
23.2 +++ b/gclib/strfuncs.c Tue Jan 24 23:54:05 2012 +0000
23.3 @@ -0,0 +1,26 @@
23.4 +#include <stdlib.h>
23.5 +#include <string.h>
23.6 +#include <gclib/mem.h>
23.7 +#include <gclib/strfuncs.h>
23.8 +
23.9 +/*
23.10 + * Duplicate at most n bytes (not characters) of str into freshly allocated,
23.11 + * nul-terminated memory. Returns NULL only when str itself is NULL.
23.12 + */
23.13 +char *str_ndup(const char *str,size_t n)
23.14 +{
23.15 +    char *copy;
23.16 +    if (str==NULL)
23.17 +        return NULL;
23.18 +    /* Zero-filled, so the copy stays terminated whatever strncpy copies. */
23.19 +    copy=strncpy(mem_alloc0(n+1,1),str,n);
23.20 +    return copy;
23.21 +}
23.22 +
23.23 +/*
23.24 + * Like strdup, but only returns NULL if str is NULL (strlen is never given NULL).
23.25 + */
23.26 +char *str_dup(const char *str)
23.27 +{
23.28 +    return str?str_ndup(str,strlen(str)):NULL;
23.29 +}
24.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
24.2 +++ b/gclib/strfuncs.h Tue Jan 24 23:54:05 2012 +0000
24.3 @@ -0,0 +1,7 @@
24.4 +#ifndef GC_STRFUNCS_H
24.5 +#define GC_STRFUNCS_H
24.6 +#include <stddef.h>  /* size_t, so that this header is self-contained */
24.7 +char *str_dup(const char *str);
24.8 +char *str_ndup(const char *str,size_t n);
24.9 +
24.10 +#endif /* GC_STRFUNCS_H */
25.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
25.2 +++ b/gclib/textfileutils.c Tue Jan 24 23:54:05 2012 +0000
25.3 @@ -0,0 +1,33 @@
25.4 +#include <stdlib.h>
25.5 +#include <stdio.h>
25.6 +#include <gclib/gclib.h>
25.7 +
25.8 +/*
25.9 + * Read a text file into memory, dropping every carriage return so that
25.10 + * DOS-style line endings are handled transparently on any platform.
25.11 + * On success returns TRUE and stores the nul-terminated text in *contents
25.12 + * (to be freed with mem_free) and its size in *length; either may be NULL.
25.13 + * Returns FALSE if the file cannot be read.
25.14 + */
25.15 +boolean file_get_contents_text(const char *filename,char **contents,
25.16 +  size_t *length)
25.17 +{
25.18 +    size_t i;  /* same type as raw_length, so large files cannot overflow it */
25.19 +    char *raw;
25.20 +    size_t raw_length;
25.21 +    String *string;
25.22 +    if (!file_get_contents(filename,&raw,&raw_length))
25.23 +        return FALSE;
25.24 +    string=string_new(NULL);
25.25 +    for(i=0;i<raw_length;i++)
25.26 +        if (raw[i]!='\r')
25.27 +            string_append_c(string,raw[i]);
25.28 +    mem_free(raw);
25.29 +    if (length)
25.30 +        *length=string->len;
25.31 +    if (contents)
25.32 +        *contents=string_free(string,FALSE);  /* hand the text to the caller */
25.33 +    else
25.34 +        string_free(string,TRUE);
25.35 +    return TRUE;
25.36 +}
26.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
26.2 +++ b/gclib/textfileutils.h Tue Jan 24 23:54:05 2012 +0000
26.3 @@ -0,0 +1,9 @@
26.4 +#ifndef GC_TEXTFILEUTILS_H
26.5 +#define GC_TEXTFILEUTILS_H
26.6 +
26.7 +#include <gclib/gclib.h>
26.8 +
26.9 +boolean file_get_contents_text(const char *filename,char **contents,
26.10 + size_t *length);
26.11 +
26.12 +#endif /* GC_TEXTFILEUTILS_H */
27.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
27.2 +++ b/gclib/types.h Tue Jan 24 23:54:05 2012 +0000
27.3 @@ -0,0 +1,6 @@
27.4 +#ifndef GC_TYPES_H
27.5 +#define GC_TYPES_H
27.6 +
27.7 +typedef int boolean;
27.8 +
27.9 +#endif /* GC_TYPES_H */
28.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
28.2 +++ b/gclib/utils.c Tue Jan 24 23:54:05 2012 +0000
28.3 @@ -0,0 +1,46 @@
28.4 +#include <stdlib.h>
28.5 +#include <string.h>
28.6 +#include <unistd.h>
28.7 +#include <gclib/mem.h>
28.8 +#include <gclib/strfuncs.h>
28.9 +#include <gclib/utils.h>
28.10 +
28.11 +#define is_valid_drive(d) ((d)>='a' && (d)<='z' || (d)>='A' && (d)<='Z')
28.12 +
28.13 +/*
28.14 + * Returns a newly allocated copy of the last component of filename, ignoring
28.15 + * any trailing directory separators. If filename consists only of separators
28.16 + * (and on Windows, possibly a drive letter), a single separator is returned.
28.17 + * If filename is empty, "." is returned.
28.18 + */
28.19 +char *path_get_basename(const char *filename)
28.20 +{
28.21 +    ssize_t base,last_nonslash;  /* signed so that -1 can mean "not found" */
28.22 +    size_t len;
28.23 +    char *retval;
28.24 +    if (*filename=='\0')
28.25 +        return str_dup(".");
28.26 +    last_nonslash=strlen(filename)-1;
28.27 +    while (last_nonslash>=0 && GC_IS_DIR_SEPARATOR(filename[last_nonslash]))
28.28 +        last_nonslash--;  /* step back over trailing separators */
28.29 +    if (last_nonslash<0)
28.30 +        /* string only containing slashes */
28.31 +        return str_dup(GC_DIR_SEPARATOR_S);
28.32 +#ifdef WIN32
28.33 +    if (last_nonslash==1 && is_valid_drive(filename[0]) && filename[1]==':')
28.34 +        /* string only containing slashes and a drive */
28.35 +        return str_dup(GC_DIR_SEPARATOR_S);
28.36 +#endif
28.37 +    base=last_nonslash;
28.38 +    while (base>=0 && !GC_IS_DIR_SEPARATOR(filename[base]))
28.39 +        base--;  /* find the separator in front of the last component */
28.40 +#ifdef WIN32
28.41 +    if (base==-1 && is_valid_drive(filename[0]) && filename[1] == ':')
28.42 +        base=1;  /* no separator: start just after the "X:" drive prefix */
28.43 +#endif
28.44 +    len=last_nonslash-base;  /* the component occupies base+1..last_nonslash */
28.45 +    retval=mem_alloc(len+1,1);
28.46 +    memcpy(retval,filename+base+1,len);
28.47 +    retval[len]='\0';
28.48 +    return retval;
28.49 +}
29.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
29.2 +++ b/gclib/utils.h Tue Jan 24 23:54:05 2012 +0000
29.3 @@ -0,0 +1,16 @@
29.4 +#ifndef GC_UTIL_H
29.5 +#define GC_UTIL_H
29.6 +/* Directory separator as a character and as a string, plus a test macro. */
29.7 +#ifdef WIN32 /* backslash is the separator here; '/' is accepted by the test too */
29.8 +#define GC_DIR_SEPARATOR '\\'
29.9 +#define GC_DIR_SEPARATOR_S "\\"
29.10 +#define GC_IS_DIR_SEPARATOR(c) ((c)==GC_DIR_SEPARATOR || (c)=='/')
29.11 +#else /* everywhere else only '/' counts as a separator */
29.12 +#define GC_DIR_SEPARATOR '/'
29.13 +#define GC_DIR_SEPARATOR_S "/"
29.14 +#define GC_IS_DIR_SEPARATOR(c) ((c)==GC_DIR_SEPARATOR)
29.15 +#endif
29.16 +/* Returns the last component of filename; see the comment on its definition. */
29.17 +char *path_get_basename(const char *filename);
29.18 +
29.19 +#endif /* GC_UTIL_H */
30.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
30.2 +++ b/gutcheck/Makefile.am Tue Jan 24 23:54:05 2012 +0000
30.3 @@ -0,0 +1,8 @@
30.4 +bin_PROGRAMS=gutcheck
30.5 +pkgdata_DATA=gutcheck.typ
30.6 +
30.7 +gutcheck.typ: gutcheck.typ.in
30.8 + sed 's/$$/\r/' $< > $@
30.9 +
30.10 +EXTRA_DIST=gutcheck.typ.in
30.11 +CLEANFILES=gutcheck.typ
31.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
31.2 +++ b/gutcheck/gutcheck.c Tue Jan 24 23:54:05 2012 +0000
31.3 @@ -0,0 +1,2982 @@
31.4 +/*************************************************************************/
31.5 +/* gutcheck - check for assorted weirdnesses in a PG candidate text file */
31.6 +/* */
31.7 +/* Version 0.991 */
31.8 +/* Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com> */
31.9 +/* */
31.10 +/* This program is free software; you can redistribute it and/or modify */
31.11 +/* it under the terms of the GNU General Public License as published by */
31.12 +/* the Free Software Foundation; either version 2 of the License, or */
31.13 +/* (at your option) any later version. */
31.14 +/* */
31.15 +/* This program is distributed in the hope that it will be useful, */
31.16 +/* but WITHOUT ANY WARRANTY; without even the implied warranty of */
31.17 +/* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the */
31.18 +/* GNU General Public License for more details. */
31.19 +/* */
31.20 +/* You should have received a copy of the GNU General Public License */
31.21 +/* along with this program; if not, write to the */
31.22 +/* Free Software Foundation, Inc., */
31.23 +/* 59 Temple Place, */
31.24 +/* Suite 330, */
31.25 +/* Boston, MA 02111-1307 USA */
31.26 +/* */
31.27 +/* */
31.28 +/* */
31.29 +/* Overview comments: */
31.30 +/* */
31.31 +/* If you're reading this, you're either interested in how to detect */
31.32 +/* formatting errors, or very very bored. */
31.33 +/* */
31.34 +/* Gutcheck is a homebrew formatting checker specifically for */
31.35 +/* spotting common formatting problems in a PG e-text. I typically */
31.36 +/* run it once or twice on a file I'm about to submit; it usually */
31.37 +/* finds a few formatting problems. It also usually finds lots of */
31.38 +/* queries that aren't problems at all; it _really_ doesn't like */
31.39 +/* the standard PG header, for example. It's optimized for straight */
31.40 +/* prose; poetry and non-fiction involving tables tend to trigger */
31.41 +/* false alarms. */
31.42 +/* */
31.43 +/* The code of gutcheck is not very interesting, but the experience */
31.44 +/* of what constitutes a possible error may be, and the best way to */
31.45 +/* illustrate that is by example. */
31.46 +/* */
31.47 +/* */
31.48 +/* Here are some common typos found in PG texts that gutcheck */
31.49 +/* will flag as errors: */
31.50 +/* */
31.51 +/* "Look!John , over there!" */
31.52 +/* <this is a HTML tag> */
31.53 +/* &so is this; */
31.54 +/* Margaret said: " Now you should start for school." */
31.55 +/* Margaret said: "Now you should start for school. (if end of para) */
31.56 +/* The horse is said to he worth a lot. */
31.57 +/* 0K - this'11 make you look close1y. */
31.58 +/* "If you do. you'll regret it!" */
31.59 +/* */
31.60 +/* There are some complications . The extra space left around that */
31.61 +/* period was an error . . . but that ellipsis wasn't. */
31.62 +/* */
31.63 +/* The last line of a paragraph */
31.64 +/* is usually short. */
31.65 +/* */
31.66 +/* This period is an error.But the periods in a.m. aren't. */
31.67 +/* */
31.68 +/* Checks that are do-able but not (well) implemented are: */
31.69 +/* Single-quote checking. */
31.70 +/* Despite 3 attempts at it, singlequote checking is still */
31.71 +/* crap in gutcheck. It may not be possible without analysis */
31.72 +/* of the whole paragraph. */
31.73 +/* */
31.74 +/*************************************************************************/
31.75 +
31.76 +
31.77 +#include <stdio.h>
31.78 +#include <stdlib.h>
31.79 +#include <string.h>
31.80 +#include <ctype.h>
31.81 +
31.82 +#define MAXWORDLEN 80 /* max length of one word */
31.83 +#define LINEBUFSIZE 2048 /* buffer size for an input line */
31.84 +
31.85 +#define MAX_USER_TYPOS 1000
31.86 +#define USERTYPO_FILE "gutcheck.typ"
31.87 +
31.88 +#ifndef MAX_PATH
31.89 +#define MAX_PATH 16384
31.90 +#endif
31.91 +
31.92 +char aline[LINEBUFSIZE];
31.93 +char prevline[LINEBUFSIZE];
31.94 +
31.95 + /* Common typos. */
31.96 +char *typo[] = { "teh", "th", "og", "fi", "ro", "adn", "yuo", "ot", "fo", "thet", "ane", "nad",
31.97 + "te", "ig", "acn", "ahve", "alot", "anbd", "andt", "awya", "aywa", "bakc", "om",
31.98 + "btu", "byt", "cna", "cxan", "coudl", "dont", "didnt", "couldnt", "wouldnt", "doesnt", "shouldnt", "doign", "ehr",
31.99 + "hmi", "hse", "esle", "eyt", "fitrs", "firts", "foudn", "frmo", "fromt", "fwe", "gaurd", "gerat", "goign",
31.100 + "gruop", "haev", "hda", "hearign", "seeign", "sayign", "herat", "hge", "hsa", "hsi", "hte", "htere",
31.101 + "htese", "htey", "htis", "hvae", "hwich", "idae", "ihs", "iits", "int", "iwll", "iwth", "jsut", "loev",
31.102 + "sefl", "myu", "nkow", "nver", "nwe", "nwo", "ocur", "ohter", "omre", "onyl", "otehr", "otu", "owrk",
31.103 + "owuld", "peice", "peices", "peolpe", "peopel", "perhasp", "perhpas", "pleasent", "poeple", "porblem",
31.104 + "porblems", "rwite", "saidt", "saidh", "saids", "seh", "smae", "smoe", "sohw", "stnad", "stopry",
31.105 + "stoyr", "stpo", "tahn", "taht", "tath", "tehy", "tghe", "tghis", "theri", "theyll", "thgat", "thge",
31.106 + "thier", "thna", "thne", "thnig", "thnigs", "thsi", "thsoe", "thta", "timne", "tirne", "tkae",
31.107 + "tthe", "tyhat", "tyhe", "veyr", "vou", "vour", "vrey", "waht", "wasnt", "awtn", "watn", "wehn", "whic", "whcih",
31.108 + "whihc", "whta", "wihch", "wief", "wiht", "witha", "wiull", "wnat", "wnated", "wnats",
31.109 + "woh", "wohle", "wokr", "woudl", "wriet", "wrod", "wroet", "wroking", "wtih", "wuould", "wya", "yera",
31.110 + "yeras", "yersa", "yoiu", "youve", "ytou", "yuor",
31.111 + /* added h/b words for version 12 - removed a few with "tbe" v.25 */
31.112 + "abead", "ahle", "ahout", "ahove", "altbough", "balf", "bardly", "bas", "bave", "baving", "bebind",
31.113 + "beld", "belp", "belped", "ber", "bere", "bim", "bis", "bome", "bouse", "bowever", "buge", "dehates",
31.114 + "deht", "han", "hecause", "hecome", "heen", "hefore", "hegan", "hegin", "heing",
31.115 + "helieve", "henefit", "hetter", "hetween", "heyond", "hig", "higber", "huild", "huy", "hy", "jobn", "joh",
31.116 + "meanwbile", "memher", "memhers", "numher", "numhers",
31.117 + "perbaps", "prohlem", "puhlic", "witbout",
31.118 + /* and a few more for .18 */
31.119 + "arn", "hin", "hirn", "wrok", "wroked", "amd", "aud", "prornise", "prornised", "modem", "bo",
31.120 + "heside", "chapteb", "chaptee", "se",
31.121 + ""};
31.122 +
31.123 +char *usertypo[MAX_USER_TYPOS];
31.124 +
31.125 + /* Common abbreviations and other OK words not to query as typos. */
31.126 + /* 0.99 last-minute - removed "ms" */
31.127 +char *okword[] = {"mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "hmm", "h'm", "hmmm", "rd", "sh", "br",
31.128 + "pp", "hm", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd", "pompeii","hawaii","hawaiian",
31.129 + "hotbed", "heartbeat", "heartbeats", "outbid", "outbids", "frostbite", "frostbitten",
31.130 + ""};
31.131 +
31.132 + /* Common abbreviations that cause otherwise unexplained periods. */
31.133 +char *abbrev[] = {"cent", "cents", "viz", "vol", "vols", "vid", "ed", "al", "etc", "op", "cit",
31.134 + "deg", "min", "chap", "oz", "mme", "mlle", "mssrs",
31.135 + ""};
31.136 + /* Two-Letter combinations that rarely if ever start words, */
31.137 + /* but are common scannos or otherwise common letter */
31.138 + /* combinations. */
31.139 +char *nostart[] = { "hr", "hl", "cb", "sb", "tb", "wb", "tl",
31.140 + "tn", "rn", "lt", "tj",
31.141 + "" };
31.142 +
31.143 + /* Two-Letter combinations that rarely if ever end words */
31.144 + /* but are common scannos or otherwise common letter */
31.145 + /* combinations */
31.146 +char *noend[] = { "cb", "gb", "pb", "sb", "tb",
31.147 + "wh","fr","br","qu","tw","gl","fl","sw","gr","sl","cl",
31.148 + "iy",
31.149 + ""};
31.150 +
31.151 +char *markup[] = { "a", "b", "big", "blockquote", "body", "br", "center",
31.152 + "col", "div", "em", "font", "h1", "h2", "h3", "h4",
31.153 + "h5", "h6", "head", "hr", "html", "i", "img", "li",
31.154 + "meta", "ol", "p", "pre", "small", "span", "strong",
31.155 + "sub", "sup", "table", "td", "tfoot", "thead", "title",
31.156 + "tr", "tt", "u", "ul",
31.157 + ""};
31.158 +
31.159 +char *DPmarkup[] = { "<sc>", "</sc>", "/*", "*/", "/#", "#/", "/$", "$/", "<tb>",
31.160 + ""}; /* <tb> added .991 */
31.161 +
31.162 +char *nocomma[] = { "the", "it's", "their", "an", "mrs", "a", "our", "that's",
31.163 + "its", "whose", "every", "i'll", "your", "my",
31.164 + "mr", "mrs", "mss", "mssrs", "ft", "pm", "st", "dr", "rd",
31.165 + "pp", "cf", "jr", "sr", "vs", "lb", "lbs", "ltd",
31.166 + "i'm", "during", "let", "toward", "among",
31.167 + ""};
31.168 +
31.169 +
31.170 +char *noperiod[] = { "every", "i'm", "during", "that's", "their", "your", "our", "my", "or",
31.171 + "and", "but", "as", "if", "the", "its", "it's", "until", "than", "whether",
31.172 + "i'll", "whose", "who", "because", "when", "let", "till", "very",
31.173 + "an", "among", "those", "into", "whom", "having", "thence",
31.174 + ""};
31.175 +
31.176 +
31.177 +char vowels[] = "aeiouàáâãäæèéêëìíîïòóôõöùúûü"; /* Carlo's old suggestion, updated .991 */
31.178 +
31.179 +struct {
31.180 + char *htmlent;
31.181 + char *htmlnum;
31.182 + char *textent;
31.183 + } entities[] = { "&", "&", "&",
31.184 + "<", "<", "<",
31.185 + ">", ">", ">",
31.186 + "°", "°", " degrees",
31.187 + "£", "£", "L",
31.188 + """, """, "\"", /* -- quotation mark = APL quote, */
31.189 + "Œ", "Œ", "OE", /* -- latin capital ligature OE, */
31.190 + "œ", "œ", "oe", /* -- latin small ligature oe, U+0153 ISOlat2 --> */
31.191 + "Š", "Š", "S", /* -- latin capital letter S with caron, */
31.192 + "š", "š", "s", /* -- latin small letter s with caron, */
31.193 + "Ÿ", "Ÿ", "Y", /* -- latin capital letter Y with diaeresis, */
31.194 + "ˆ", "ˆ", "", /* -- modifier letter circumflex accent, */
31.195 + "˜", "˜", "~", /* -- small tilde, U+02DC ISOdia --> */
31.196 + " ", " ", " ", /* -- en space, U+2002 ISOpub --> */
31.197 + " ", " ", " ", /* -- em space, U+2003 ISOpub --> */
31.198 + " ", " ", " ", /* -- thin space, U+2009 ISOpub --> */
31.199 + "–", "–", "-", /* -- en dash, U+2013 ISOpub --> */
31.200 + "—", "—", "--", /* -- em dash, U+2014 ISOpub --> */
31.201 + "‘", "‘", "'", /* -- left single quotation mark, */
31.202 + "’", "’", "'", /* -- right single quotation mark, */
31.203 + "‚", "‚", "'", /* -- single low-9 quotation mark, U+201A NEW --> */
31.204 + "“", "“", "\"", /* -- left double quotation mark, */
31.205 + "”", "”", "\"", /* -- right double quotation mark, */
31.206 + "„", "„", "\"", /* -- double low-9 quotation mark, U+201E NEW --> */
31.207 + "‹", "‹", "\"", /* -- single left-pointing angle quotation mark, */
31.208 + "›", "›", "\"", /* -- single right-pointing angle quotation mark, */
31.209 + " ", " ", " ", /* -- no-break space = non-breaking space, */
31.210 + "¡", "¡", "!", /* -- inverted exclamation mark, U+00A1 ISOnum --> */
31.211 + "¢", "¢", "c", /* -- cent sign, U+00A2 ISOnum --> */
31.212 + "£", "£", "L", /* -- pound sign, U+00A3 ISOnum --> */
31.213 + "¤", "¤", "$", /* -- currency sign, U+00A4 ISOnum --> */
31.214 + "¥", "¥", "Y", /* -- yen sign = yuan sign, U+00A5 ISOnum --> */
31.215 + "§", "§", "--", /* -- section sign, U+00A7 ISOnum --> */
31.216 + "¨", "¨", " ", /* -- diaeresis = spacing diaeresis, */
31.217 + "©", "©", "(C) ", /* -- copyright sign, U+00A9 ISOnum --> */
31.218 + "ª", "ª", " ", /* -- feminine ordinal indicator, U+00AA ISOnum --> */
31.219 + "«", "«", "\"", /* -- left-pointing double angle quotation mark */
31.220 + "­", "­", "-", /* -- soft hyphen = discretionary hyphen, */
31.221 + "®", "®", "(R) ", /* -- registered sign = registered trade mark sign, */
31.222 + "¯", "¯", " ", /* -- macron = spacing macron = overline */
31.223 + "°", "°", " degrees", /* -- degree sign, U+00B0 ISOnum --> */
31.224 + "±", "±", "+-", /* -- plus-minus sign = plus-or-minus sign, */
31.225 + "²", "²", "2", /* -- superscript two = superscript digit two */
31.226 + "³", "³", "3", /* -- superscript three = superscript digit three */
31.227 + "´", "´", " ", /* -- acute accent = spacing acute, */
31.228 + "µ", "µ", "m", /* -- micro sign, U+00B5 ISOnum --> */
31.229 + "¶", "¶", "--", /* -- pilcrow sign = paragraph sign, */
31.230 + "¸", "¸", " ", /* -- cedilla = spacing cedilla, U+00B8 ISOdia --> */
31.231 + "¹", "¹", "1", /* -- superscript one = superscript digit one, */
31.232 + "º", "º", " ", /* -- masculine ordinal indicator, */
31.233 + "»", "»", "\"", /* -- right-pointing double angle quotation mark */
31.234 + "¼", "¼", "1/4", /* -- vulgar fraction one quarter */
31.235 + "½", "½", "1/2", /* -- vulgar fraction one half */
31.236 + "¾", "¾", "3/4", /* -- vulgar fraction three quarters */
31.237 + "¿", "¿", "?", /* -- inverted question mark */
31.238 + "À", "À", "A", /* -- latin capital letter A with grave */
31.239 + "Á", "Á", "A", /* -- latin capital letter A with acute, */
31.240 + "Â", "Â", "A", /* -- latin capital letter A with circumflex, */
31.241 + "Ã", "Ã", "A", /* -- latin capital letter A with tilde, */
31.242 + "Ä", "Ä", "A", /* -- latin capital letter A with diaeresis, */
31.243 + "Å", "Å", "A", /* -- latin capital letter A with ring above */
31.244 + "Æ", "Æ", "AE", /* -- latin capital letter AE */
31.245 + "Ç", "Ç", "C", /* -- latin capital letter C with cedilla, */
31.246 + "È", "È", "E", /* -- latin capital letter E with grave, */
31.247 + "É", "É", "E", /* -- latin capital letter E with acute, */
31.248 + "Ê", "Ê", "E", /* -- latin capital letter E with circumflex, */
31.249 + "Ë", "Ë", "E", /* -- latin capital letter E with diaeresis, */
31.250 + "Ì", "Ì", "I", /* -- latin capital letter I with grave, */
31.251 + "Í", "Í", "I", /* -- latin capital letter I with acute, */
31.252 + "Î", "Î", "I", /* -- latin capital letter I with circumflex, */
31.253 + "Ï", "Ï", "I", /* -- latin capital letter I with diaeresis, */
31.254 + "Ð", "Ð", "E", /* -- latin capital letter ETH, U+00D0 ISOlat1 --> */
31.255 + "Ñ", "Ñ", "N", /* -- latin capital letter N with tilde, */
31.256 + "Ò", "Ò", "O", /* -- latin capital letter O with grave, */
31.257 + "Ó", "Ó", "O", /* -- latin capital letter O with acute, */
31.258 + "Ô", "Ô", "O", /* -- latin capital letter O with circumflex, */
31.259 + "Õ", "Õ", "O", /* -- latin capital letter O with tilde, */
31.260 + "Ö", "Ö", "O", /* -- latin capital letter O with diaeresis, */
31.261 + "×", "×", "*", /* -- multiplication sign, U+00D7 ISOnum --> */
31.262 + "Ø", "Ø", "O", /* -- latin capital letter O with stroke */
31.263 + "Ù", "Ù", "U", /* -- latin capital letter U with grave, */
31.264 + "Ú", "Ú", "U", /* -- latin capital letter U with acute, */
31.265 + "Û", "Û", "U", /* -- latin capital letter U with circumflex, */
31.266 + "Ü", "Ü", "U", /* -- latin capital letter U with diaeresis, */
31.267 + "Ý", "Ý", "Y", /* -- latin capital letter Y with acute, */
31.268 + "Þ", "Þ", "TH", /* -- latin capital letter THORN, */
31.269 + "ß", "ß", "sz", /* -- latin small letter sharp s = ess-zed, */
31.270 + "à", "à", "a", /* -- latin small letter a with grave */
31.271 + "á", "á", "a", /* -- latin small letter a with acute, */
31.272 + "â", "â", "a", /* -- latin small letter a with circumflex, */
31.273 + "ã", "ã", "a", /* -- latin small letter a with tilde, */
31.274 + "ä", "ä", "a", /* -- latin small letter a with diaeresis, */
31.275 + "å", "å", "a", /* -- latin small letter a with ring above */
31.276 + "æ", "æ", "ae", /* -- latin small letter ae */
31.277 + "ç", "ç", "c", /* -- latin small letter c with cedilla, */
31.278 + "è", "è", "e", /* -- latin small letter e with grave, */
31.279 + "é", "é", "e", /* -- latin small letter e with acute, */
31.280 + "ê", "ê", "e", /* -- latin small letter e with circumflex, */
31.281 + "ë", "ë", "e", /* -- latin small letter e with diaeresis, */
31.282 + "ì", "ì", "i", /* -- latin small letter i with grave, */
31.283 + "í", "í", "i", /* -- latin small letter i with acute, */
31.284 + "î", "î", "i", /* -- latin small letter i with circumflex, */
31.285 + "ï", "ï", "i", /* -- latin small letter i with diaeresis, */
31.286 + "ð", "ð", "eth", /* -- latin small letter eth, U+00F0 ISOlat1 --> */
31.287 + "ñ", "ñ", "n", /* -- latin small letter n with tilde, */
31.288 + "ò", "ò", "o", /* -- latin small letter o with grave, */
31.289 + "ó", "ó", "o", /* -- latin small letter o with acute, */
31.290 + "ô", "ô", "o", /* -- latin small letter o with circumflex, */
31.291 + "õ", "õ", "o", /* -- latin small letter o with tilde, */
31.292 + "ö", "ö", "o", /* -- latin small letter o with diaeresis, */
31.293 + "÷", "÷", "/", /* -- division sign, U+00F7 ISOnum --> */
31.294 + "ø", "ø", "o", /* -- latin small letter o with stroke, */
31.295 + "ù", "ù", "u", /* -- latin small letter u with grave, */
31.296 + "ú", "ú", "u", /* -- latin small letter u with acute, */
31.297 + "û", "û", "u", /* -- latin small letter u with circumflex, */
31.298 + "ü", "ü", "u", /* -- latin small letter u with diaeresis, */
31.299 + "ý", "ý", "y", /* -- latin small letter y with acute, */
31.300 + "þ", "þ", "th", /* -- latin small letter thorn, */
31.301 + "ÿ", "ÿ", "y", /* -- latin small letter y with diaeresis, */
31.302 + "", "" };
31.303 +
31.304 +/* ---- list of special characters ---- */
31.305 +#define CHAR_SPACE 32
31.306 +#define CHAR_TAB 9
31.307 +#define CHAR_LF 10
31.308 +#define CHAR_CR 13
31.309 +#define CHAR_DQUOTE 34
31.310 +#define CHAR_SQUOTE 39
31.311 +#define CHAR_OPEN_SQUOTE 96
31.312 +#define CHAR_TILDE 126
31.313 +#define CHAR_ASTERISK 42
31.314 +#define CHAR_FORESLASH 47
31.315 +#define CHAR_CARAT 94
31.316 +
31.317 +#define CHAR_UNDERSCORE '_'
31.318 +#define CHAR_OPEN_CBRACK '{'
31.319 +#define CHAR_CLOSE_CBRACK '}'
31.320 +#define CHAR_OPEN_RBRACK '('
31.321 +#define CHAR_CLOSE_RBRACK ')'
31.322 +#define CHAR_OPEN_SBRACK '['
31.323 +#define CHAR_CLOSE_SBRACK ']'
31.324 +
31.325 +
31.326 +
31.327 +
31.328 +
31.329 +/* ---- longest and shortest normal PG line lengths ----*/
31.330 +#define LONGEST_PG_LINE 75
31.331 +#define WAY_TOO_LONG 80
31.332 +#define SHORTEST_PG_LINE 55
31.333 +
31.334 +#define SWITCHES "ESTPXLOYHWVMUD" /* switches:- */
31.335 + /* D - ignore DP-specific markup */
31.336 + /* E - echo queried line */
31.337 + /* S - check single quotes */
31.338 + /* T - check common typos */
31.339 + /* P - require closure of quotes on */
31.340 + /* every paragraph */
31.341 + /* X - "Trust no one" :-) Paranoid! */
31.342 + /* Queries everything */
31.343 + /* L - line end checking defaults on */
31.344 + /* -L turns it off */
31.345 + /* O - overview. Just shows counts. */
31.346 + /* Y - puts errors to stdout */
31.347 + /* instead of stderr */
31.348 + /* H - Echoes header fields */
31.349 + /* M - Ignore markup in < > */
31.350 + /* U - Use file of User-defined Typos*/
31.351 + /* W - Defaults for use on Web upload*/
31.352 + /* V - Verbose - list EVERYTHING! */
31.353 +#define SWITNO 14 /* max number of switch parms */
31.354 + /* - used for defining array-size */
31.355 +#define MINARGS 1 /* minimum no of args excl switches */
31.356 +#define MAXARGS 1 /* maximum no of args excl switches */
31.357 +
31.358 +int pswit[SWITNO]; /* program switches set by SWITCHES */
31.359 +
31.360 +#define ECHO_SWITCH 0
31.361 +#define SQUOTE_SWITCH 1
31.362 +#define TYPO_SWITCH 2
31.363 +#define QPARA_SWITCH 3
31.364 +#define PARANOID_SWITCH 4
31.365 +#define LINE_END_SWITCH 5
31.366 +#define OVERVIEW_SWITCH 6
31.367 +#define STDOUT_SWITCH 7
31.368 +#define HEADER_SWITCH 8
31.369 +#define WEB_SWITCH 9
31.370 +#define VERBOSE_SWITCH 10
31.371 +#define MARKUP_SWITCH 11
31.372 +#define USERTYPO_SWITCH 12
31.373 +#define DP_SWITCH 13
31.374 +
31.375 +
31.376 +
31.377 +long cnt_dquot; /* for overview mode, count of doublequote queries */
31.378 +long cnt_squot; /* for overview mode, count of singlequote queries */
31.379 +long cnt_brack; /* for overview mode, count of brackets queries */
31.380 +long cnt_bin; /* for overview mode, count of non-ASCII queries */
31.381 +long cnt_odd; /* for overview mode, count of odd character queries */
31.382 +long cnt_long; /* for overview mode, count of long line errors */
31.383 +long cnt_short; /* for overview mode, count of short line queries */
31.384 +long cnt_punct; /* for overview mode, count of punctuation and spacing queries */
31.385 +long cnt_dash; /* for overview mode, count of dash-related queries */
31.386 +long cnt_word; /* for overview mode, count of word queries */
31.387 +long cnt_html; /* for overview mode, count of html queries */
31.388 +long cnt_lineend; /* for overview mode, count of line-end queries */
31.389 +long cnt_spacend; /* count of lines with space at end V .21 */
31.390 +long linecnt; /* count of total lines in the file */
31.391 +long checked_linecnt; /* count of lines actually gutchecked V .26 */
31.392 +
31.393 +void proghelp(void);
31.394 +void procfile(char *);
31.395 +
31.396 +#define LOW_THRESHOLD 0
31.397 +#define HIGH_THRESHOLD 1
31.398 +
31.399 +#define START 0
31.400 +#define END 1
31.401 +#define PREV 0
31.402 +#define NEXT 1
31.403 +#define FIRST_OF_PAIR 0
31.404 +#define SECOND_OF_PAIR 1
31.405 +
31.406 +#define MAX_WORDPAIR 1000
31.407 +
31.408 +char running_from[MAX_PATH];
31.409 +
31.410 +int mixdigit(char *);
31.411 +char *getaword(char *, char *);
31.412 +int matchword(char *, char *);
31.413 +char *flgets(char *, int, FILE *, long);
31.414 +void lowerit(char *);
31.415 +int gcisalpha(unsigned char);
31.416 +int gcisdigit(unsigned char);
31.417 +int gcisletter(unsigned char);
31.418 +char *gcstrchr(char *s, char c);
31.419 +void postprocess_for_HTML(char *);
31.420 +char *linehasmarkup(char *);
31.421 +char *losemarkup(char *);
31.422 +int tagcomp(char *, char *);
31.423 +char *loseentities(char *);
31.424 +int isroman(char *);
31.425 +int usertypo_count;
31.426 +void postprocess_for_DP(char *);
31.427 +
31.428 +char wrk[LINEBUFSIZE];
31.429 +
31.430 +/* This is disgustingly lazy, predefining max words & lengths, */
31.431 +/* but now I'm out of 16-bit restrictions, what's a couple of K? */
31.432 +#define MAX_QWORD 50
31.433 +#define MAX_QWORD_LENGTH 40
31.434 +char qword[MAX_QWORD][MAX_QWORD_LENGTH];
31.435 +char qperiod[MAX_QWORD][MAX_QWORD_LENGTH];
31.436 +signed int dupcnt[MAX_QWORD];
31.437 +
31.438 +
31.439 +/* main - parse the switches, optionally load the user typo list, run */
31.440 +/* procfile() on the single file argument; returns 1 on a bad arg count. */
31.441 +int main(int argc, char **argv)
31.442 +{
31.443 + char *argsw, *s;
31.444 + int i, switno, invarg;
31.445 + char usertypo_file[MAX_PATH];
31.446 + FILE *usertypofile;
31.447 +
31.448 +
31.449 + if (strlen(argv[0]) < sizeof(running_from))
31.450 + strcpy(running_from, argv[0]); /* save the path to the executable gutcheck */
31.451 +
31.452 + /* find out what directory we're running from: cut everything after the last separator */
31.453 + for (s = running_from + strlen(running_from); s > running_from && s[-1] != '/' && s[-1] != '\\'; s--)
31.454 + s[-1] = 0; /* bound is tested first, so nothing before running_from[0] is read */
31.455 +
31.456 +
31.457 + switno = strlen(SWITCHES);
31.458 + for (i = switno ; --i >= 0 ; )
31.459 + pswit[i] = 0; /* initialise switches */
31.460 +
31.461 + /* Standard loop to extract switches. */
31.462 + /* When we come out of this loop, the arguments will be */
31.463 + /* in argv[0] upwards and the switches used will be */
31.464 + /* represented by their equivalent elements in pswit[] */
31.465 + while ( --argc > 0 && **++argv == '-')
31.466 + for (argsw = argv[0]+1; *argsw !='\0'; argsw++)
31.467 + for (i = switno, invarg = 1; (--i >= 0) && invarg == 1 ; )
31.468 + if ((toupper((unsigned char)*argsw)) == SWITCHES[i] ) {
31.469 + invarg = 0;
31.470 + pswit[i] = 1;
31.471 + }
31.472 +
31.473 + pswit[PARANOID_SWITCH] ^= 1; /* Paranoid checking is turned OFF, not on, by its switch */
31.474 +
31.475 + if (pswit[PARANOID_SWITCH]) { /* if running in paranoid mode */
31.476 + pswit[TYPO_SWITCH] = pswit[TYPO_SWITCH] ^ 1; /* force typo checks as well */
31.477 + } /* v.20 removed s and p switches from paranoid mode */
31.478 +
31.479 + pswit[LINE_END_SWITCH] ^= 1; /* Line-end checking is turned OFF, not on, by its switch */
31.480 + pswit[ECHO_SWITCH] ^= 1; /* V.21 Echoing is turned OFF, not on, by its switch */
31.481 +
31.482 + if (pswit[OVERVIEW_SWITCH]) /* just print summary; don't echo */
31.483 + pswit[ECHO_SWITCH] = 0;
31.484 +
31.485 + /* Web uploads - for the moment, this is really just a placeholder */
31.486 + /* until we decide what processing we really want to do on web uploads */
31.487 + if (pswit[WEB_SWITCH]) { /* specific override for web uploads */
31.488 + pswit[ECHO_SWITCH] = 1;
31.489 + pswit[SQUOTE_SWITCH] = 0;
31.490 + pswit[TYPO_SWITCH] = 1;
31.491 + pswit[QPARA_SWITCH] = 0;
31.492 + pswit[PARANOID_SWITCH] = 1;
31.493 + pswit[LINE_END_SWITCH] = 0;
31.494 + pswit[OVERVIEW_SWITCH] = 0;
31.495 + pswit[STDOUT_SWITCH] = 0;
31.496 + pswit[HEADER_SWITCH] = 1;
31.497 + pswit[VERBOSE_SWITCH] = 0;
31.498 + pswit[MARKUP_SWITCH] = 0;
31.499 + pswit[USERTYPO_SWITCH] = 0;
31.500 + pswit[DP_SWITCH] = 0;
31.501 + }
31.502 +
31.503 +
31.504 + if (argc < MINARGS || argc > MAXARGS) { /* check number of args */
31.505 + proghelp();
31.506 + return(1); /* exit */
31.507 + }
31.508 +
31.509 +
31.510 + /* read in the user-defined stealth scanno list */
31.511 +
31.512 + if (pswit[USERTYPO_SWITCH]) { /* ... we were told we had one! */
31.513 + if ((usertypofile = fopen(USERTYPO_FILE, "rb")) == NULL) { /* not in cwd. try gutcheck directory. */
31.514 + strcpy(usertypo_file, running_from);
31.515 + strcat(usertypo_file, USERTYPO_FILE);
31.516 + if ((usertypofile = fopen(usertypo_file, "rb")) == NULL) { /* we ain't got no user typo file! */
31.517 + printf(" --> I couldn't find gutcheck.typ -- proceeding without user typos.\n");
31.518 + }
31.519 + }
31.520 +
31.521 + usertypo_count = 0;
31.522 + if (usertypofile) { /* we managed to open a User Typo File! */
31.523 + if (pswit[USERTYPO_SWITCH]) {
31.524 + while (flgets(aline, LINEBUFSIZE-1, usertypofile, (long)usertypo_count)) {
31.525 + if (strlen(aline) > 1) { /* ignore empty and one-character lines */
31.526 + if ((int)*aline > 33) { /* skip lines whose first char is a control char, space or '!' */
31.527 + s = malloc(strlen(aline)+1);
31.528 + if (!s) {
31.529 + fprintf(stderr, "gutcheck: cannot get enough memory for user typo file!!\n");
31.530 + exit(1);
31.531 + }
31.532 + strcpy(s, aline);
31.533 + usertypo[usertypo_count] = s;
31.534 + usertypo_count++;
31.535 + if (usertypo_count >= MAX_USER_TYPOS) {
31.536 + printf(" --> Only %d user-defined typos allowed: ignoring the rest\n", MAX_USER_TYPOS);
31.537 + break;
31.538 + }
31.539 + }
31.540 + }
31.541 + }
31.542 + }
31.543 + fclose(usertypofile);
31.544 + }
31.545 + }
31.546 +
31.547 +
31.548 +
31.549 +
31.550 + fprintf(stderr, "gutcheck: Check and report on an e-text\n");
31.551 +
31.552 + cnt_dquot = cnt_squot = cnt_brack = cnt_bin = cnt_odd = cnt_long =
31.553 + cnt_short = cnt_punct = cnt_dash = cnt_word = cnt_html = cnt_lineend =
31.554 + cnt_spacend = 0;
31.555 +
31.556 + procfile(argv[0]);
31.557 +
31.558 + if (pswit[OVERVIEW_SWITCH]) {
31.559 + printf(" Checked %ld lines of %ld (head+foot = %ld)\n\n",
31.560 + checked_linecnt, linecnt, linecnt - checked_linecnt);
31.561 + printf(" --------------- Queries found --------------\n");
31.562 + if (cnt_long) printf(" Long lines: %5ld\n",cnt_long);
31.563 + if (cnt_short) printf(" Short lines: %5ld\n",cnt_short);
31.564 + if (cnt_lineend) printf(" Line-end problems: %5ld\n",cnt_lineend);
31.565 + if (cnt_word) printf(" Common typos: %5ld\n",cnt_word);
31.566 + if (cnt_dquot) printf(" Unmatched quotes: %5ld\n",cnt_dquot);
31.567 + if (cnt_squot) printf(" Unmatched SingleQuotes: %5ld\n",cnt_squot);
31.568 + if (cnt_brack) printf(" Unmatched brackets: %5ld\n",cnt_brack);
31.569 + if (cnt_bin) printf(" Non-ASCII characters: %5ld\n",cnt_bin);
31.570 + if (cnt_odd) printf(" Proofing characters: %5ld\n",cnt_odd);
31.571 + if (cnt_punct) printf(" Punctuation & spacing queries: %5ld\n",cnt_punct);
31.572 + if (cnt_dash) printf(" Non-standard dashes: %5ld\n",cnt_dash);
31.573 + if (cnt_html) printf(" Possible HTML tags: %5ld\n",cnt_html);
31.574 + printf("\n");
31.575 + printf(" TOTAL QUERIES %5ld\n",
31.576 + cnt_dquot + cnt_squot + cnt_brack + cnt_bin + cnt_odd + cnt_long +
31.577 + cnt_short + cnt_punct + cnt_dash + cnt_word + cnt_html + cnt_lineend);
31.578 + }
31.579 +
31.580 + return(0);
31.581 +}
31.582 +
31.583 +
31.584 +
31.585 +/* procfile - process one file */
31.586 +
31.587 +void procfile(char *filename)
31.588 +{
31.589 +
31.590 + char *s, *t, *s1, laststart, *wordstart;
31.591 + char inword[MAXWORDLEN], testword[MAXWORDLEN];
31.592 + char parastart[81]; /* first line of current para */
31.593 + FILE *infile;
31.594 + long quot, squot, firstline, alphalen, totlen, binlen,
31.595 + shortline, longline, verylongline, spacedash, emdash,
31.596 + space_emdash, non_PG_space_emdash, PG_space_emdash,
31.597 + footerline, dotcomma, start_para_line, astline, fslashline,
31.598 + standalone_digit, hyphens, htmcount, endquote_count;
31.599 + long spline, nspline;
31.600 + signed int i, j, llen, isemptyline, isacro, isellipsis, istypo, alower,
31.601 + eNon_A, eTab, eTilde, eAst, eFSlash, eCarat;
31.602 + signed int warn_short, warn_long, warn_bin, warn_dash, warn_dotcomma,
31.603 + warn_ast, warn_fslash, warn_digit, warn_hyphen, warn_endquote;
31.604 + unsigned int lastlen, lastblen;
31.605 + signed int s_brack, c_brack, r_brack, c_unders;
31.606 + signed int open_single_quote, close_single_quote, guessquote, dquotepar, squotepar;
31.607 + signed int isnewpara, vowel, consonant;
31.608 + char dquote_err[80], squote_err[80], rbrack_err[80], sbrack_err[80], cbrack_err[80],
31.609 + unders_err[80];
31.610 + signed int qword_index, qperiod_index, isdup;
31.611 + signed int enddash;
31.612 + signed int Dutchcount, isDutch, Frenchcount, isFrench;
31.613 +
31.614 +
31.615 +
31.616 +
31.617 +
31.618 + laststart = CHAR_SPACE;
31.619 + lastlen = lastblen = 0;
31.620 + *dquote_err = *squote_err = *rbrack_err = *cbrack_err = *sbrack_err =
31.621 + *unders_err = *prevline = 0;
31.622 + linecnt = firstline = alphalen = totlen = binlen =
31.623 + shortline = longline = spacedash = emdash = checked_linecnt =
31.624 + space_emdash = non_PG_space_emdash = PG_space_emdash =
31.625 + footerline = dotcomma = start_para_line = astline = fslashline =
31.626 + standalone_digit = hyphens = htmcount = endquote_count = 0;
31.627 + quot = squot = s_brack = c_brack = r_brack = c_unders = 0;
31.628 + i = llen = isemptyline = isacro = isellipsis = istypo = 0;
31.629 + warn_short = warn_long = warn_bin = warn_dash = warn_dotcomma =
31.630 + warn_ast = warn_fslash = warn_digit = warn_endquote = 0;
31.631 + isnewpara = vowel = consonant = enddash = 0;
31.632 + spline = nspline = 0;
31.633 + qword_index = qperiod_index = isdup = 0;
31.634 + *inword = *testword = 0;
31.635 + open_single_quote = close_single_quote = guessquote = dquotepar = squotepar = 0;
31.636 + Dutchcount = isDutch = Frenchcount = isFrench = 0;
31.637 +
31.638 +
31.639 + for (j = 0; j < MAX_QWORD; j++) {
31.640 + dupcnt[j] = 0;
31.641 + for (i = 0; i < MAX_QWORD_LENGTH; i++)
31.642 + qword[i][j] = 0;
31.643 + qperiod[i][j] = 0;
31.644 + }
31.645 +
31.646 +
31.647 + if ((infile = fopen(filename, "rb")) == NULL) {
31.648 + if (pswit[STDOUT_SWITCH])
31.649 + fprintf(stdout, "gutcheck: cannot open %s\n", filename);
31.650 + else
31.651 + fprintf(stderr, "gutcheck: cannot open %s\n", filename);
31.652 + exit(1);
31.653 + }
31.654 +
31.655 + fprintf(stdout, "\n\nFile: %s\n\n", filename);
31.656 + firstline = shortline = longline = verylongline = 0;
31.657 +
31.658 +
31.659 + /*****************************************************/
31.660 + /* */
31.661 + /* Run a first pass - verify that it's a valid PG */
31.662 + /* file, decide whether to report some things that */
31.663 + /* occur many times in the text like long or short */
31.664 + /* lines, non-standard dashes, and other good stuff */
31.665 + /* I'll doubtless think of later. */
31.666 + /* */
31.667 + /*****************************************************/
31.668 +
31.669 + /*****************************************************/
31.670 + /* V.24 Sigh. Yet Another Header Change */
31.671 + /*****************************************************/
31.672 +
31.673 + while (fgets(aline, LINEBUFSIZE-1, infile)) {
31.674 + while (aline[strlen(aline)-1] == 10 || aline[strlen(aline)-1] == 13 ) aline[strlen(aline)-1] = 0;
31.675 + linecnt++;
31.676 + if (strstr(aline, "*END") && strstr(aline, "SMALL PRINT") && (strstr(aline, "PUBLIC DOMAIN") || strstr(aline, "COPYRIGHT"))) {
31.677 + if (spline)
31.678 + printf(" --> Duplicate header?\n");
31.679 + spline = linecnt + 1; /* first line of non-header text, that is */
31.680 + }
31.681 + if (!strncmp(aline, "*** START", 9) && strstr(aline, "PROJECT GUTENBERG")) {
31.682 + if (nspline)
31.683 + printf(" --> Duplicate header?\n");
31.684 + nspline = linecnt + 1; /* first line of non-header text, that is */
31.685 + }
31.686 + if (spline || nspline) {
31.687 + lowerit(aline);
31.688 + if (strstr(aline, "end") && strstr(aline, "project gutenberg")) {
31.689 + if (strstr(aline, "end") < strstr(aline, "project gutenberg")) {
31.690 + if (footerline) {
31.691 + if (!nspline) /* it's an old-form header - we can detect duplicates */
31.692 + printf(" --> Duplicate footer?\n");
31.693 + else
31.694 + ;
31.695 + }
31.696 + else {
31.697 + footerline = linecnt;
31.698 + }
31.699 + }
31.700 + }
31.701 + }
31.702 + if (spline) firstline = spline;
31.703 + if (nspline) firstline = nspline; /* override with new */
31.704 +
31.705 + if (footerline) continue; /* 0.99+ don't count the boilerplate in the footer */
31.706 +
31.707 + llen = strlen(aline);
31.708 + totlen += llen;
31.709 + for (i = 0; i < llen; i++) {
31.710 + if ((unsigned char)aline[i] > 127) binlen++;
31.711 + if (gcisalpha(aline[i])) alphalen++;
31.712 + if (i > 0)
31.713 + if (aline[i] == CHAR_DQUOTE && isalpha(aline[i-1]))
31.714 + endquote_count++;
31.715 + }
31.716 + if (strlen(aline) > 2
31.717 + && lastlen > 2 && lastlen < SHORTEST_PG_LINE
31.718 + && lastblen > 2 && lastblen > SHORTEST_PG_LINE
31.719 + && laststart != CHAR_SPACE)
31.720 + shortline++;
31.721 +
31.722 + if (*aline) /* fixed line below for 0.96 */
31.723 + if ((unsigned char)aline[strlen(aline)-1] <= CHAR_SPACE) cnt_spacend++;
31.724 +
31.725 + if (strstr(aline, ".,")) dotcomma++;
31.726 + /* 0.98 only count ast lines for ignoring purposes where there is */
31.727 + /* lower-case text on the line */
31.728 + if (strstr(aline, "*")) {
31.729 + for (s = aline; *s; s++)
31.730 + if (*s >='a' && *s <= 'z')
31.731 + break;
31.732 + if (*s) astline++;
31.733 + }
31.734 + if (strstr(aline, "/"))
31.735 + fslashline++;
31.736 + for (i = llen-1; i > 0 && (unsigned char)aline[i] <= CHAR_SPACE; i--);
31.737 + if (aline[i] == '-' && aline[i-1] != '-') hyphens++;
31.738 +
31.739 + if (llen > LONGEST_PG_LINE) longline++;
31.740 + if (llen > WAY_TOO_LONG) verylongline++;
31.741 +
31.742 + if (strstr(aline, "<") && strstr(aline, ">")) {
31.743 + i = (signed int) (strstr(aline, ">") - strstr(aline, "<") + 1);
31.744 + if (i > 0)
31.745 + htmcount++;
31.746 + if (strstr(aline, "<i>")) htmcount +=4; /* bonus marks! */
31.747 + }
31.748 +
31.749 + /* Check for spaced em-dashes */
31.750 + if (strstr(aline,"--")) {
31.751 + emdash++;
31.752 + if (*(strstr(aline, "--")-1) == CHAR_SPACE ||
31.753 + (*(strstr(aline, "--")+2) == CHAR_SPACE))
31.754 + space_emdash++;
31.755 + if (*(strstr(aline, "--")-1) == CHAR_SPACE &&
31.756 + (*(strstr(aline, "--")+2) == CHAR_SPACE))
31.757 + non_PG_space_emdash++; /* count of em-dashes with spaces both sides */
31.758 + if (*(strstr(aline, "--")-1) != CHAR_SPACE &&
31.759 + (*(strstr(aline, "--")+2) != CHAR_SPACE))
31.760 + PG_space_emdash++; /* count of PG-type em-dashes with no spaces */
31.761 + }
31.762 +
31.763 + for (s = aline; *s;) {
31.764 + s = getaword(s, inword);
31.765 + if (!strcmp(inword, "hij") || !strcmp(inword, "niet"))
31.766 + Dutchcount++;
31.767 + if (!strcmp(inword, "dans") || !strcmp(inword, "avec"))
31.768 + Frenchcount++;
31.769 + if (!strcmp(inword, "0") || !strcmp(inword, "1"))
31.770 + standalone_digit++;
31.771 + }
31.772 +
31.773 + /* Check for spaced dashes */
31.774 + if (strstr(aline," -"))
31.775 + if (*(strstr(aline, " -")+2) != '-')
31.776 + spacedash++;
31.777 + lastblen = lastlen;
31.778 + lastlen = strlen(aline);
31.779 + laststart = aline[0];
31.780 +
31.781 + }
31.782 + fclose(infile);
31.783 +
31.784 +
31.785 + /* now, based on this quick view, make some snap decisions */
31.786 + if (cnt_spacend > 0) {
31.787 + printf(" --> %ld lines in this file have white space at end\n", cnt_spacend);
31.788 + }
31.789 +
31.790 + warn_dotcomma = 1;
31.791 + if (dotcomma > 5) {
31.792 + warn_dotcomma = 0;
31.793 + printf(" --> %ld lines in this file contain '.,'. Not reporting them.\n", dotcomma);
31.794 + }
31.795 +
31.796 + /* if more than 50 lines, or one-tenth, are short, don't bother reporting them */
31.797 + warn_short = 1;
31.798 + if (shortline > 50 || shortline * 10 > linecnt) {
31.799 + warn_short = 0;
31.800 + printf(" --> %ld lines in this file are short. Not reporting short lines.\n", shortline);
31.801 + }
31.802 +
31.803 + /* if more than 50 lines, or one-tenth, are long, don't bother reporting them */
31.804 + warn_long = 1;
31.805 + if (longline > 50 || longline * 10 > linecnt) {
31.806 + warn_long = 0;
31.807 + printf(" --> %ld lines in this file are long. Not reporting long lines.\n", longline);
31.808 + }
31.809 +
31.810 + /* if more than 10 lines contain asterisks, don't bother reporting them V.0.97 */
31.811 + warn_ast = 1;
31.812 + if (astline > 10 ) {
31.813 + warn_ast = 0;
31.814 + printf(" --> %ld lines in this file contain asterisks. Not reporting them.\n", astline);
31.815 + }
31.816 +
31.817 + /* if more than 10 lines contain forward slashes, don't bother reporting them V.0.99 */
31.818 + warn_fslash = 1;
31.819 + if (fslashline > 10 ) {
31.820 + warn_fslash = 0;
31.821 + printf(" --> %ld lines in this file contain forward slashes. Not reporting them.\n", fslashline);
31.822 + }
31.823 +
31.824 + /* if more than 20 lines contain unpunctuated endquotes, don't bother reporting them V.0.99 */
31.825 + warn_endquote = 1;
31.826 + if (endquote_count > 20 ) {
31.827 + warn_endquote = 0;
31.828 + printf(" --> %ld lines in this file contain unpunctuated endquotes. Not reporting them.\n", endquote_count);
31.829 + }
31.830 +
31.831 + /* if more than 10 lines contain standalone digits, don't bother reporting them V.0.97 */
31.832 + warn_digit = 1;
31.833 + if (standalone_digit > 10 ) {
31.834 + warn_digit = 0;
31.835 + printf(" --> %ld lines in this file contain standalone 0s and 1s. Not reporting them.\n", standalone_digit);
31.836 + }
31.837 +
31.838 + /* if more than 20 lines contain hyphens at end, don't bother reporting them V.0.98 */
31.839 + warn_hyphen = 1;
31.840 + if (hyphens > 20 ) {
31.841 + warn_hyphen = 0;
31.842 + printf(" --> %ld lines in this file have hyphens at end. Not reporting them.\n", hyphens);
31.843 + }
31.844 +
31.845 + if (htmcount > 20 && !pswit[MARKUP_SWITCH]) {
31.846 + printf(" --> Looks like this is HTML. Switching HTML mode ON.\n");
31.847 + pswit[MARKUP_SWITCH] = 1;
31.848 + }
31.849 +
31.850 + if (verylongline > 0) {
31.851 + printf(" --> %ld lines in this file are VERY long!\n", verylongline);
31.852 + }
31.853 +
31.854 + /* If there are more non-PG spaced dashes than PG em-dashes, */
31.855 + /* assume it's deliberate */
31.856 + /* Current PG guidelines say don't use them, but older texts do,*/
31.857 + /* and some people insist on them whatever the guidelines say. */
31.858 + /* V.20 removed requirement that PG_space_emdash be greater than*/
31.859 + /* ten before turning off warnings about spaced dashes. */
31.860 + warn_dash = 1;
31.861 + if (spacedash + non_PG_space_emdash > PG_space_emdash) {
31.862 + warn_dash = 0;
31.863 + printf(" --> There are %ld spaced dashes and em-dashes. Not reporting them.\n", spacedash + non_PG_space_emdash);
31.864 + }
31.865 +
31.866 + /* if more than a quarter of characters are hi-bit, bug out */
31.867 + warn_bin = 1;
31.868 + if (binlen * 4 > totlen) {
31.869 + printf(" --> This file does not appear to be ASCII. Terminating. Best of luck with it!\n");
31.870 + exit(1);
31.871 + }
31.872 + if (alphalen * 4 < totlen) {
31.873 + printf(" --> This file does not appear to be text. Terminating. Best of luck with it!\n");
31.874 + exit(1);
31.875 + }
31.876 + if ((binlen * 100 > totlen) || (binlen > 100)) {
31.877 + printf(" --> There are a lot of foreign letters here. Not reporting them.\n");
31.878 + warn_bin = 0;
31.879 + }
31.880 +
31.881 + /* isDutch and isFrench added .991 Feb 06 for Frank, Jeroen, Renald */
31.882 + isDutch = 0;
31.883 + if (Dutchcount > 50) {
31.884 + isDutch = 1;
31.885 + printf(" --> This looks like Dutch - switching off dashes and warnings for 's Middags case.\n");
31.886 + }
31.887 +
31.888 + isFrench = 0;
31.889 + if (Frenchcount > 50) {
31.890 + isFrench = 1;
31.891 + printf(" --> This looks like French - switching off some doublepunct.\n");
31.892 + }
31.893 +
31.894 + if (firstline && footerline)
31.895 + printf(" The PG header and footer appear to be already on.\n");
31.896 + else {
31.897 + if (firstline)
31.898 + printf(" The PG header is on - no footer.\n");
31.899 + if (footerline)
31.900 + printf(" The PG footer is on - no header.\n");
31.901 + }
31.902 + printf("\n");
31.903 +
31.904 + /* V.22 George Davis asked for an override switch to force it to list everything */
31.905 + if (pswit[VERBOSE_SWITCH]) {
31.906 + warn_bin = 1;
31.907 + warn_short = 1;
31.908 + warn_dotcomma = 1;
31.909 + warn_long = 1;
31.910 + warn_dash = 1;
31.911 + warn_digit = 1;
31.912 + warn_ast = 1;
31.913 + warn_fslash = 1;
31.914 + warn_hyphen = 1;
31.915 + warn_endquote = 1;
31.916 + printf(" *** Verbose output is ON -- you asked for it! ***\n");
31.917 + }
31.918 +
31.919 + if (isDutch)
31.920 + warn_dash = 0; /* Frank suggested turning it REALLY off for Dutch */
31.921 +
31.922 + if ((infile = fopen(filename, "rb")) == NULL) {
31.923 + if (pswit[STDOUT_SWITCH])
31.924 + fprintf(stdout, "gutcheck: cannot open %s\n", filename);
31.925 + else
31.926 + fprintf(stderr, "gutcheck: cannot open %s\n", filename);
31.927 + exit(1);
31.928 + }
31.929 +
31.930 + if (footerline > 0 && firstline > 0 && footerline > firstline && footerline - firstline < 100) { /* ugh */
31.931 + printf(" --> I don't really know where this text starts. \n");
31.932 + printf(" There are no reference points.\n");
31.933 + printf(" I'm going to have to report the header and footer as well.\n");
31.934 + firstline=0;
31.935 + }
31.936 +
31.937 +
31.938 +
31.939 + /*****************************************************/
31.940 + /* */
31.941 + /* Here we go with the main pass. Hold onto yer hat! */
31.942 + /* */
31.943 + /*****************************************************/
31.944 +
31.945 + /* Re-init some variables we've dirtied */
31.946 + quot = squot = linecnt = 0;
31.947 + laststart = CHAR_SPACE;
31.948 + lastlen = lastblen = 0;
31.949 +
31.950 + while (flgets(aline, LINEBUFSIZE-1, infile, linecnt+1)) {
31.951 + linecnt++;
31.952 + if (linecnt == 1) isnewpara = 1;
31.953 + if (pswit[DP_SWITCH])
31.954 + if (!strncmp(aline, "-----File: ", 11))
31.955 + continue; // skip DP page separators completely
31.956 + if (linecnt < firstline || (footerline > 0 && linecnt > footerline)) {
31.957 + if (pswit[HEADER_SWITCH]) {
31.958 + if (!strncmp(aline, "Title:", 6))
31.959 + printf(" %s\n", aline);
31.960 + if (!strncmp (aline, "Author:", 7))
31.961 + printf(" %s\n", aline);
31.962 + if (!strncmp(aline, "Release Date:", 13))
31.963 + printf(" %s\n", aline);
31.964 + if (!strncmp(aline, "Edition:", 8))
31.965 + printf(" %s\n\n", aline);
31.966 + }
31.967 + continue; /* skip through the header */
31.968 + }
31.969 + checked_linecnt++;
31.970 + s = aline;
31.971 + isemptyline = 1; /* assume the line is empty until proven otherwise */
31.972 +
31.973 + /* If we are in a state of unbalanced quotes, and this line */
31.974 + /* doesn't begin with a quote, output the stored error message */
31.975 + /* If the -P switch was used, print the warning even if the */
31.976 + /* new para starts with quotes */
31.977 + /* Version .20 - if the new paragraph does start with a quote, */
31.978 + /* but is indented, I was giving a spurious error. Need to */
31.979 + /* check the first _non-space_ character on the line rather */
31.980 + /* than the first character when deciding whether the para */
31.981 + /* starts with a quote. Using *t for this. */
31.982 + t = s;
31.983 + while (*t == ' ') t++;
31.984 + if (*dquote_err)
31.985 + if (*t != CHAR_DQUOTE || pswit[QPARA_SWITCH]) {
31.986 + if (!pswit[OVERVIEW_SWITCH]) {
31.987 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
31.988 + printf(dquote_err);
31.989 + }
31.990 + else
31.991 + cnt_dquot++;
31.992 + }
31.993 + if (*squote_err) {
31.994 + if (*t != CHAR_SQUOTE && *t != CHAR_OPEN_SQUOTE || pswit[QPARA_SWITCH] || squot) {
31.995 + if (!pswit[OVERVIEW_SWITCH]) {
31.996 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
31.997 + printf(squote_err);
31.998 + }
31.999 + else
31.1000 + cnt_squot++;
31.1001 + }
31.1002 + squot = 0;
31.1003 + }
31.1004 + if (*rbrack_err) {
31.1005 + if (!pswit[OVERVIEW_SWITCH]) {
31.1006 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
31.1007 + printf(rbrack_err);
31.1008 + }
31.1009 + else
31.1010 + cnt_brack++;
31.1011 + }
31.1012 + if (*sbrack_err) {
31.1013 + if (!pswit[OVERVIEW_SWITCH]) {
31.1014 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
31.1015 + printf(sbrack_err);
31.1016 + }
31.1017 + else
31.1018 + cnt_brack++;
31.1019 + }
31.1020 + if (*cbrack_err) {
31.1021 + if (!pswit[OVERVIEW_SWITCH]) {
31.1022 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
31.1023 + printf(cbrack_err);
31.1024 + }
31.1025 + else
31.1026 + cnt_brack++;
31.1027 + }
31.1028 + if (*unders_err) {
31.1029 + if (!pswit[OVERVIEW_SWITCH]) {
31.1030 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", parastart);
31.1031 + printf(unders_err);
31.1032 + }
31.1033 + else
31.1034 + cnt_brack++;
31.1035 + }
31.1036 +
31.1037 + *dquote_err = *squote_err = *rbrack_err = *cbrack_err =
31.1038 + *sbrack_err = *unders_err = 0;
31.1039 +
31.1040 +
31.1041 + /* look along the line, accumulate the count of quotes, and see */
31.1042 + /* if this is an empty line - i.e. a line with nothing on it */
31.1043 + /* but spaces. */
31.1044 + /* V .12 also if line has just spaces, * and/or - on it, don't */
31.1045 + /* count it, since empty lines with asterisks or dashes to */
31.1046 + /* separate sections are common. */
31.1047 + /* V .15 new single-quote checking - has to be better than the */
31.1048 + /* previous version, but how much better? fingers crossed! */
31.1049 + /* V .20 add period to * and - as characters on a separator line*/
31.1050 + s = aline;
31.1051 + while (*s) {
31.1052 + if (*s == CHAR_DQUOTE) quot++;
31.1053 + if (*s == CHAR_SQUOTE || *s == CHAR_OPEN_SQUOTE)
31.1054 + if (s == aline) { /* at start of line, it can only be an openquote */
31.1055 + if (strncmp(s+2, "tis", 3) && strncmp(s+2, "Tis", 3)) /* hardcode a very common exception! */
31.1056 + open_single_quote++;
31.1057 + }
31.1058 + else
31.1059 + if (gcisalpha(*(s-1)) && gcisalpha(*(s+1)))
31.1060 + ; /* do nothing! - it's definitely an apostrophe, not a quote */
31.1061 + else /* it's outside a word - let's check it out */
31.1062 + if (*s == CHAR_OPEN_SQUOTE || gcisalpha(*(s+1))) { /* it damwell better BE an openquote */
31.1063 + if (strncmp(s+1, "tis", 3) && strncmp(s+1, "Tis", 3)) /* hardcode a very common exception! */
31.1064 + open_single_quote++;
31.1065 + }
31.1066 + else { /* now - is it a closequote? */
31.1067 + guessquote = 0; /* accumulate clues */
31.1068 + if (gcisalpha(*(s-1))) { /* it follows a letter - could be either */
31.1069 + guessquote += 1;
31.1070 + if (*(s-1) == 's') { /* looks like a plural apostrophe */
31.1071 + guessquote -= 3;
31.1072 + if (*(s+1) == CHAR_SPACE) /* bonus marks! */
31.1073 + guessquote -= 2;
31.1074 + }
31.1075 + }
31.1076 + else /* it doesn't have a letter either side */
31.1077 + if (strchr(".?!,;:", *(s-1)) && (strchr(".?!,;: ", *(s+1))))
31.1078 + guessquote += 8; /* looks like a closequote */
31.1079 + else
31.1080 + guessquote += 1;
31.1081 + if (open_single_quote > close_single_quote)
31.1082 + guessquote += 1; /* give it the benefit of some doubt - if a squote is already open */
31.1083 + else
31.1084 + guessquote -= 1;
31.1085 + if (guessquote >= 0)
31.1086 + close_single_quote++;
31.1087 + }
31.1088 +
31.1089 + if (*s != CHAR_SPACE
31.1090 + && *s != '-'
31.1091 + && *s != '.'
31.1092 + && *s != CHAR_ASTERISK
31.1093 + && *s != 13
31.1094 + && *s != 10) isemptyline = 0; /* ignore lines like * * * as spacers */
31.1095 + if (*s == CHAR_UNDERSCORE) c_unders++;
31.1096 + if (*s == CHAR_OPEN_CBRACK) c_brack++;
31.1097 + if (*s == CHAR_CLOSE_CBRACK) c_brack--;
31.1098 + if (*s == CHAR_OPEN_RBRACK) r_brack++;
31.1099 + if (*s == CHAR_CLOSE_RBRACK) r_brack--;
31.1100 + if (*s == CHAR_OPEN_SBRACK) s_brack++;
31.1101 + if (*s == CHAR_CLOSE_SBRACK) s_brack--;
31.1102 + s++;
31.1103 + }
31.1104 +
31.1105 + if (isnewpara && !isemptyline) { /* This line is the start of a new paragraph */
31.1106 + start_para_line = linecnt;
31.1107 + strncpy(parastart, aline, 80); /* Capture its first line in case we want to report it later */
31.1108 + parastart[79] = 0;
31.1109 + dquotepar = squotepar = 0; /* restart the quote count 0.98 */
31.1110 + s = aline;
31.1111 + while (!gcisalpha(*s) && !gcisdigit(*s) && *s) s++; /* V.97 fixed bug - overran line and gave false warning - rare */
31.1112 + if (*s >= 'a' && *s <='z') { /* and its first letter is lowercase */
31.1113 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1114 + if (!pswit[OVERVIEW_SWITCH])
31.1115 + printf(" Line %ld column %d - Paragraph starts with lower-case\n", linecnt, (int)(s - aline) +1);
31.1116 + else
31.1117 + cnt_punct++;
31.1118 + }
31.1119 + isnewpara = 0; /* Signal the end of new para processing */
31.1120 + }
31.1121 +
31.1122 + /* Check for an em-dash broken at line end */
31.1123 + if (enddash && *aline == '-') {
31.1124 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1125 + if (!pswit[OVERVIEW_SWITCH])
31.1126 + printf(" Line %ld column 1 - Broken em-dash?\n", linecnt);
31.1127 + else
31.1128 + cnt_punct++;
31.1129 + }
31.1130 + enddash = 0;
31.1131 + for (s = aline + strlen(aline) - 1; *s == ' ' && s > aline; s--);
31.1132 + if (s >= aline && *s == '-')
31.1133 + enddash = 1;
31.1134 +
31.1135 +
31.1136 + /* Check for invalid or questionable characters in the line */
31.1137 + /* Anything above 127 is invalid for plain ASCII, and */
31.1138 + /* non-printable control characters should also be flagged. */
31.1139 + /* Tabs should generally not be there. */
31.1140 + /* Jan 06, in 0.99: Hm. For some strange reason, I either */
31.1141 + /* never created or deleted the check for unprintable */
31.1142 + /* control characters. They should be reported even if */
31.1143 + /* warn_bin is off, I think, and in full. */
31.1144 +
31.1145 + for (s = aline; *s; s++) {
31.1146 + i = (unsigned char) *s;
31.1147 + if (i < CHAR_SPACE && i != CHAR_LF && i != CHAR_CR && i != CHAR_TAB) {
31.1148 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1149 + if (!pswit[OVERVIEW_SWITCH])
31.1150 + printf(" Line %ld column %d - Control character %d\n", linecnt, (int) (s - aline) + 1, i);
31.1151 + else
31.1152 + cnt_bin++;
31.1153 + }
31.1154 + }
31.1155 +
31.1156 + if (warn_bin) {
31.1157 + eNon_A = eTab = eTilde = eCarat = eFSlash = eAst = 0; /* don't repeat multiple warnings on one line */
31.1158 + for (s = aline; *s; s++) {
31.1159 + if (!eNon_A && ((*s < CHAR_SPACE && *s != 9 && *s != '\n') || (unsigned char)*s > 127)) {
31.1160 + i = *s; /* annoying kludge for signed chars */
31.1161 + if (i < 0) i += 256;
31.1162 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1163 + if (!pswit[OVERVIEW_SWITCH])
31.1164 + if (i > 127 && i < 160)
31.1165 + printf(" Line %ld column %d - Non-ISO-8859 character %d\n", linecnt, (int) (s - aline) + 1, i);
31.1166 + else
31.1167 + printf(" Line %ld column %d - Non-ASCII character %d\n", linecnt, (int) (s - aline) + 1, i);
31.1168 + else
31.1169 + cnt_bin++;
31.1170 + eNon_A = 1;
31.1171 + }
31.1172 + if (!eTab && *s == CHAR_TAB) {
31.1173 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1174 + if (!pswit[OVERVIEW_SWITCH])
31.1175 + printf(" Line %ld column %d - Tab character?\n", linecnt, (int) (s - aline) + 1);
31.1176 + else
31.1177 + cnt_odd++;
31.1178 + eTab = 1;
31.1179 + }
31.1180 + if (!eTilde && *s == CHAR_TILDE) { /* often used by OCR software to indicate an unrecognizable character */
31.1181 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1182 + if (!pswit[OVERVIEW_SWITCH])
31.1183 + printf(" Line %ld column %d - Tilde character?\n", linecnt, (int) (s - aline) + 1);
31.1184 + else
31.1185 + cnt_odd++;
31.1186 + eTilde = 1;
31.1187 + }
31.1188 + if (!eCarat && *s == CHAR_CARAT) {
31.1189 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1190 + if (!pswit[OVERVIEW_SWITCH])
31.1191 + printf(" Line %ld column %d - Carat character?\n", linecnt, (int) (s - aline) + 1);
31.1192 + else
31.1193 + cnt_odd++;
31.1194 + eCarat = 1;
31.1195 + }
31.1196 + if (!eFSlash && *s == CHAR_FORESLASH && warn_fslash) {
31.1197 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1198 + if (!pswit[OVERVIEW_SWITCH])
31.1199 + printf(" Line %ld column %d - Forward slash?\n", linecnt, (int) (s - aline) + 1);
31.1200 + else
31.1201 + cnt_odd++;
31.1202 + eFSlash = 1;
31.1203 + }
31.1204 + /* report asterisks only in paranoid mode, since they're often deliberate */
31.1205 + if (!eAst && pswit[PARANOID_SWITCH] && warn_ast && !isemptyline && *s == CHAR_ASTERISK) {
31.1206 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1207 + if (!pswit[OVERVIEW_SWITCH])
31.1208 + printf(" Line %ld column %d - Asterisk?\n", linecnt, (int) (s - aline) + 1);
31.1209 + else
31.1210 + cnt_odd++;
31.1211 + eAst = 1;
31.1212 + }
31.1213 + }
31.1214 + }
31.1215 +
31.1216 + /* Check for line too long */
31.1217 + if (warn_long) {
31.1218 + if (strlen(aline) > LONGEST_PG_LINE) {
31.1219 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1220 + if (!pswit[OVERVIEW_SWITCH])
31.1221 + printf(" Line %ld column %d - Long line %d\n", linecnt, strlen(aline), strlen(aline));
31.1222 + else
31.1223 + cnt_long++;
31.1224 + }
31.1225 + }
31.1226 +
31.1227 + /* Check for line too short. */
31.1228 + /* This one is a bit trickier to implement: we don't want to */
31.1229 + /* flag the last line of a paragraph for being short, so we */
31.1230 + /* have to wait until we know that our current line is a */
31.1231 + /* "normal" line, then report the _previous_ line if it was too */
31.1232 + /* short. We also don't want to report indented lines like */
31.1233 + /* chapter heads or formatted quotations. We therefore keep */
31.1234 + /* lastlen as the length of the last line examined, and */
31.1235 + /* lastblen as the length of the last but one, and try to */
31.1236 + /* suppress unnecessary warnings by checking that both were of */
31.1237 + /* "normal" length. We keep the first character of the last */
31.1238 + /* line in laststart, and if it was a space, we assume that the */
31.1239 + /* formatting is deliberate. I can't figure out a way to */
31.1240 + /* distinguish something like a quoted verse left-aligned or */
31.1241 + /* the header or footer of a letter from a paragraph of short */
31.1242 + /* lines - maybe if I examined the whole paragraph, and if the */
31.1243 + /* para has less than, say, 8 lines and if all lines are short, */
31.1244 + /* then just assume it's OK? Need to look at some texts to see */
31.1245 + /* how often a formula like this would get the right result. */
31.1246 + /* V0.99 changed the tolerance for length to ignore from 2 to 1 */
31.1247 + if (warn_short) {
31.1248 + if (strlen(aline) > 1
31.1249 + && lastlen > 1 && lastlen < SHORTEST_PG_LINE
31.1250 + && lastblen > 1 && lastblen > SHORTEST_PG_LINE
31.1251 + && laststart != CHAR_SPACE) {
31.1252 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", prevline);
31.1253 + if (!pswit[OVERVIEW_SWITCH])
31.1254 + printf(" Line %ld column %d - Short line %d?\n", linecnt-1, strlen(prevline), strlen(prevline));
31.1255 + else
31.1256 + cnt_short++;
31.1257 + }
31.1258 + }
31.1259 + lastblen = lastlen;
31.1260 + lastlen = strlen(aline);
31.1261 + laststart = aline[0];
31.1262 +
31.1263 + /* look for punctuation at start of line */
31.1264 + if (*aline && strchr(".?!,;:", aline[0])) { /* if it's punctuation */
31.1265 + if (strncmp(". . .", aline, 5)) { /* exception for ellipsis: V.98 tightened up to except only a full ellipsis */
31.1266 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1267 + if (!pswit[OVERVIEW_SWITCH])
31.1268 + printf(" Line %ld column 1 - Begins with punctuation?\n", linecnt);
31.1269 + else
31.1270 + cnt_punct++;
31.1271 + }
31.1272 + }
31.1273 +
31.1274 + /* Check for spaced em-dashes */
31.1275 + /* V.20 must check _all_ occurrences of "--" on the line */
31.1276 + /* hence the loop - even if the first double-dash is OK */
31.1277 + /* there may be another that's wrong later on. */
31.1278 + if (warn_dash) {
31.1279 + s = aline;
31.1280 + while (strstr(s,"--")) {
31.1281 + if (*(strstr(s, "--")-1) == CHAR_SPACE ||
31.1282 + (*(strstr(s, "--")+2) == CHAR_SPACE)) {
31.1283 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1284 + if (!pswit[OVERVIEW_SWITCH])
31.1285 + printf(" Line %ld column %d - Spaced em-dash?\n", linecnt, (int) (strstr(s,"--") - aline) + 1);
31.1286 + else
31.1287 + cnt_dash++;
31.1288 + }
31.1289 + s = strstr(s,"--") + 2;
31.1290 + }
31.1291 + }
31.1292 +
31.1293 + /* Check for spaced dashes */
31.1294 + if (warn_dash)
31.1295 + if (strstr(aline," -")) {
31.1296 + if (*(strstr(aline, " -")+2) != '-') {
31.1297 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1298 + if (!pswit[OVERVIEW_SWITCH])
31.1299 + printf(" Line %ld column %d - Spaced dash?\n", linecnt, (int) (strstr(aline," -") - aline) + 1);
31.1300 + else
31.1301 + cnt_dash++;
31.1302 + }
31.1303 + }
31.1304 + else
31.1305 + if (strstr(aline,"- ")) {
31.1306 + if (*(strstr(aline, "- ")-1) != '-') {
31.1307 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1308 + if (!pswit[OVERVIEW_SWITCH])
31.1309 + printf(" Line %ld column %d - Spaced dash?\n", linecnt, (int) (strstr(aline,"- ") - aline) + 1);
31.1310 + else
31.1311 + cnt_dash++;
31.1312 + }
31.1313 + }
31.1314 +
31.1315 + /* v 0.99 */
31.1316 + /* Check for unmarked paragraphs indicated by separate speakers */
31.1317 + /* May well be false positive: */
31.1318 + /* "Bravo!" "Wonderful!" called the crowd. */
31.1319 + /* but useful all the same. */
31.1320 + s = wrk;
31.1321 + *s = 0;
31.1322 + if (strstr(aline, "\" \"")) s = strstr(aline, "\" \"");
31.1323 + if (strstr(aline, "\" \"")) s = strstr(aline, "\" \"");
31.1324 + if (*s) {
31.1325 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1326 + if (!pswit[OVERVIEW_SWITCH])
31.1327 + printf(" Line %ld column %d - Query missing paragraph break?\n", linecnt, (int)(s - aline) +1);
31.1328 + else
31.1329 + cnt_punct++;
31.1330 + }
31.1331 +
31.1332 +
31.1333 +
31.1334 + /* Check for "to he" and other easy he/be errors */
31.1335 + /* This is a very inadequate effort on the he/be problem, */
31.1336 + /* but the phrase "to he" is always an error, whereas "to */
31.1337 + /* be" is quite common. I chuckle when it does catch one! */
31.1338 + /* Similarly, '"Quiet!", be said.' is a non-be error */
31.1339 + /* V .18 - "to he" is _not_ always an error!: */
31.1340 + /* "Where they went to he couldn't say." */
31.1341 + /* but I'm leaving it in anyway. */
31.1342 + /* V .20 Another false positive: */
31.1343 + /* What would "Cinderella" be without the . . . */
31.1344 + /* and another "If he wants to he can see for himself." */
31.1345 + /* V .21 Added " is be " and " be is " and " be was " */
31.1346 + /* V .99 Added jeebies code -- removed again. */
31.1347 + /* Is jeebies code worth adding? Rare to see he/be */
31.1348 + /* errors with modern OCR. Separate program? Yes! */
31.1349 + /* jeebies does the job without cluttering up this. */
31.1350 + /* We do get a few more queryable pairs from the */
31.1351 + /* project though -- they're cheap to implement. */
31.1352 + /* Also added a column number for guiguts. */
31.1353 +
31.1354 + s = wrk;
31.1355 + *s = 0;
31.1356 + if (strstr(aline," to he ")) s = strstr(aline," to he ");
31.1357 + if (strstr(aline,"\" be ")) s = strstr(aline,"\" be ");
31.1358 + if (strstr(aline,"\", be ")) s = strstr(aline,"\", be ");
31.1359 + if (strstr(aline," is be ")) s = strstr(aline," is be ");
31.1360 + if (strstr(aline," be is ")) s = strstr(aline," be is ");
31.1361 + if (strstr(aline," was be ")) s = strstr(aline," was be ");
31.1362 + if (strstr(aline," be would ")) s = strstr(aline," be would ");
31.1363 + if (strstr(aline," be could ")) s = strstr(aline," be could ");
31.1364 + if (*s) {
31.1365 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1366 + if (!pswit[OVERVIEW_SWITCH])
31.1367 + printf(" Line %ld column %d - Query he/be error?\n", linecnt, (int)(s - aline) +1);
31.1368 + else
31.1369 + cnt_word++;
31.1370 + }
31.1371 +
31.1372 + s = wrk;
31.1373 + *s = 0;
31.1374 + if (strstr(aline," i bad ")) s = strstr(aline," i bad ");
31.1375 + if (strstr(aline," you bad ")) s = strstr(aline," you bad ");
31.1376 + if (strstr(aline," he bad ")) s = strstr(aline," he bad ");
31.1377 + if (strstr(aline," she bad ")) s = strstr(aline," she bad ");
31.1378 + if (strstr(aline," they bad ")) s = strstr(aline," they bad ");
31.1379 + if (strstr(aline," a had ")) s = strstr(aline," a had ");
31.1380 + if (strstr(aline," the had ")) s = strstr(aline," the had ");
31.1381 + if (*s) {
31.1382 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1383 + if (!pswit[OVERVIEW_SWITCH])
31.1384 + printf(" Line %ld column %d - Query had/bad error?\n", linecnt, (int)(s - aline) +1);
31.1385 + else
31.1386 + cnt_word++;
31.1387 + }
31.1388 +
31.1389 +
31.1390 + /* V .97 Added ", hut " Not too common, hut pretty certain */
31.1391 + /* V.99 changed to add a column number for guiguts */
31.1392 + s = wrk;
31.1393 + *s = 0;
31.1394 + if (strstr(aline,", hut ")) s = strstr(aline,", hut ");
31.1395 + if (strstr(aline,"; hut ")) s = strstr(aline,"; hut ");
31.1396 + if (*s) {
31.1397 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1398 + if (!pswit[OVERVIEW_SWITCH])
31.1399 + printf(" Line %ld column %d - Query hut/but error?\n", linecnt, (int)(s - aline) +1);
31.1400 + else
31.1401 + cnt_word++;
31.1402 + }
31.1403 +
31.1404 + /* Special case - angled bracket in front of "From" placed there by an MTA */
31.1405 + /* when sending an e-mail. V .21 */
31.1406 + if (strstr(aline, ">From")) {
31.1407 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1408 + if (!pswit[OVERVIEW_SWITCH])
31.1409 + printf(" Line %ld column %d - Query angled bracket with From\n", linecnt, (int)(strstr(aline, ">From") - aline) +1);
31.1410 + else
31.1411 + cnt_punct++;
31.1412 + }
31.1413 +
31.1414 + /* V 0.98 Check for a single character line - often an overflow from bad wrapping. */
31.1415 + if (*aline && !*(aline+1)) {
31.1416 + if (*aline == 'I' || *aline == 'V' || *aline == 'X' || *aline == 'L' || gcisdigit(*aline))
31.1417 + ; /* nothing - ignore numerals alone on a line. */
31.1418 + else {
31.1419 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1420 + if (!pswit[OVERVIEW_SWITCH])
31.1421 + printf(" Line %ld column 1 - Query single character line\n", linecnt);
31.1422 + else
31.1423 + cnt_punct++;
31.1424 + }
31.1425 + }
31.1426 +
31.1427 + /* V 0.98 Check for I" - often should be ! */
31.1428 + if (strstr(aline, " I\"")) {
31.1429 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1430 + if (!pswit[OVERVIEW_SWITCH])
31.1431 + printf(" Line %ld column %ld - Query I=exclamation mark?\n", linecnt, strstr(aline, " I\"") - aline);
31.1432 + else
31.1433 + cnt_punct++;
31.1434 + }
31.1435 +
31.1436 + /* V 0.98 Check for period without a capital letter. Cut-down from gutspell */
31.1437 + /* Only works when it happens on a single line. */
31.1438 +
31.1439 + if (pswit[PARANOID_SWITCH])
31.1440 + for (t = s = aline; strstr(t,". ");) {
31.1441 + t = strstr(t, ". ");
31.1442 + if (t == s) {
31.1443 + t++;
31.1444 + continue; /* start of line punctuation is handled elsewhere */
31.1445 + }
31.1446 + if (!gcisalpha(*(t-1))) {
31.1447 + t++;
31.1448 + continue;
31.1449 + }
31.1450 + if (isDutch) { /* For Frank & Jeroen -- 's Middags case */
31.1451 + if (*(t+2) == CHAR_SQUOTE &&
31.1452 + *(t+3)>='a' && *(t+3)<='z' &&
31.1453 + *(t+4) == CHAR_SPACE &&
31.1454 + *(t+5)>='A' && *(t+5)<='Z') {
31.1455 + t++;
31.1456 + continue;
31.1457 + }
31.1458 + }
31.1459 + s1 = t+2;
31.1460 + while (*s1 && !gcisalpha(*s1) && !isdigit(*s1))
31.1461 + s1++;
31.1462 + if (*s1 >= 'a' && *s1 <= 'z') { /* we have something to investigate */
31.1463 + istypo = 1;
31.1464 + for (s1 = t - 1; s1 >= s &&
31.1465 + (gcisalpha(*s1) || gcisdigit(*s1) ||
31.1466 + (*s1 == CHAR_SQUOTE && gcisalpha(*(s1+1)) && gcisalpha(*(s1-1)))); s1--); /* so let's go back and find out */
31.1467 + s1++;
31.1468 + for (i = 0; *s1 && *s1 != '.'; s1++, i++)
31.1469 + testword[i] = *s1;
31.1470 + testword[i] = 0;
31.1471 + for (i = 0; *abbrev[i]; i++)
31.1472 + if (!strcmp(testword, abbrev[i]))
31.1473 + istypo = 0;
31.1474 +// if (*testword >= 'A' && *testword <= 'Z')
31.1475 +// istypo = 0;
31.1476 + if (gcisdigit(*testword)) istypo = 0;
31.1477 + if (!*(testword+1)) istypo = 0;
31.1478 + if (isroman(testword)) istypo = 0;
31.1479 + if (istypo) {
31.1480 + istypo = 0;
31.1481 + for (i = 0; testword[i]; i++)
31.1482 + if (strchr(vowels, testword[i]))
31.1483 + istypo = 1;
31.1484 + }
31.1485 + if (istypo) {
31.1486 + isdup = 0;
31.1487 + if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH])
31.1488 + for (i = 0; i < qperiod_index; i++)
31.1489 + if (!strcmp(testword, qperiod[i])) {
31.1490 + isdup = 1;
31.1491 + }
31.1492 + if (!isdup) {
31.1493 + if (qperiod_index < MAX_QWORD && strlen(testword) < MAX_QWORD_LENGTH) {
31.1494 + strcpy(qperiod[qperiod_index], testword);
31.1495 + qperiod_index++;
31.1496 + }
31.1497 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1498 + if (!pswit[OVERVIEW_SWITCH])
31.1499 + printf(" Line %ld column %d - Extra period?\n", linecnt, (int)(t - aline)+1);
31.1500 + else
31.1501 + cnt_punct++;
31.1502 + }
31.1503 + }
31.1504 + }
31.1505 + t++;
31.1506 + }
31.1507 +
31.1508 +
31.1509 + if (pswit[TYPO_SWITCH]) { /* Should have put this condition in at the start of 0.99. Duh! */
31.1510 + /* Check for words usually not followed by punctuation 0.99 */
31.1511 + for (s = aline; *s;) {
31.1512 + wordstart = s;
31.1513 + s = getaword(s, inword);
31.1514 + if (!*inword) continue;
31.1515 + lowerit(inword);
31.1516 + for (i = 0; *nocomma[i]; i++)
31.1517 + if (!strcmp(inword, nocomma[i])) {
31.1518 + if (*s == ',' || *s == ';' || *s == ':') {
31.1519 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1520 + if (!pswit[OVERVIEW_SWITCH])
31.1521 + printf(" Line %ld column %d - Query punctuation after %s?\n", linecnt, (int)(s - aline)+1, inword);
31.1522 + else
31.1523 + cnt_punct++;
31.1524 + }
31.1525 + }
31.1526 + for (i = 0; *noperiod[i]; i++)
31.1527 + if (!strcmp(inword, noperiod[i])) {
31.1528 + if (*s == '.' || *s == '!') {
31.1529 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1530 + if (!pswit[OVERVIEW_SWITCH])
31.1531 + printf(" Line %ld column %d - Query punctuation after %s?\n", linecnt, (int)(s - aline)+1, inword);
31.1532 + else
31.1533 + cnt_punct++;
31.1534 + }
31.1535 + }
31.1536 + }
31.1537 + }
31.1538 +
31.1539 +
31.1540 +
31.1541 + /* Check for commonly mistyped words, and digits like 0 for O in a word */
31.1542 + for (s = aline; *s;) {
31.1543 + wordstart = s;
31.1544 + s = getaword(s, inword);
31.1545 + if (!*inword) continue; /* don't bother with empty lines */
31.1546 + if (mixdigit(inword)) {
31.1547 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1548 + if (!pswit[OVERVIEW_SWITCH])
31.1549 + printf(" Line %ld column %ld - Query digit in %s\n", linecnt, (int)(wordstart - aline) + 1, inword);
31.1550 + else
31.1551 + cnt_word++;
31.1552 + }
31.1553 +
31.1554 + /* put the word through a series of tests for likely typos and OCR errors */
31.1555 + /* V.21 I had allowed lots of typo-checking even with the typo switch */
31.1556 + /* turned off, but I really should disallow reporting of them when */
31.1557 + /* the switch is off. Hence the "if" below. */
31.1558 + if (pswit[TYPO_SWITCH]) {
31.1559 + istypo = 0;
31.1560 + strcpy(testword, inword);
31.1561 + alower = 0;
31.1562 + for (i = 0; i < (signed int)strlen(testword); i++) { /* lowercase for testing */
31.1563 + if (testword[i] >= 'a' && testword[i] <= 'z') alower = 1;
31.1564 + if (alower && testword[i] >= 'A' && testword[i] <= 'Z') {
31.1565 + /* we have an uppercase mid-word. However, there are common cases: */
31.1566 + /* Mac and Mc like McGill */
31.1567 + /* French contractions like l'Abbe */
31.1568 + if ((i == 2 && testword[0] == 'm' && testword[1] == 'c') ||
31.1569 + (i == 3 && testword[0] == 'm' && testword[1] == 'a' && testword[2] == 'c') ||
31.1570 + (i > 0 && testword[i-1] == CHAR_SQUOTE))
31.1571 + ; /* do nothing! */
31.1572 +
31.1573 + else { /* V.97 - remove separate case of uppercase within word so that */
31.1574 + /* names like VanAllen fall into qword_index and get reported only once */
31.1575 + istypo = 1;
31.1576 + }
31.1577 + }
31.1578 + testword[i] = (char)tolower(testword[i]);
31.1579 + }
31.1580 +
31.1581 + /* check for certain unlikely two-letter combinations at word start and end */
31.1582 + /* V.0.97 - this replaces individual hardcoded checks in previous versions */
31.1583 + if (strlen(testword) > 1) {
31.1584 + for (i = 0; *nostart[i]; i++)
31.1585 + if (!strncmp(testword, nostart[i], 2))
31.1586 + istypo = 1;
31.1587 + for (i = 0; *noend[i]; i++)
31.1588 + if (!strncmp(testword + strlen(testword) -2, noend[i], 2))
31.1589 + istypo = 1;
31.1590 + }
31.1591 +
31.1592 +
31.1593 + /* ght is common, gbt never. Like that. */
31.1594 + if (strstr(testword, "cb")) istypo = 1;
31.1595 + if (strstr(testword, "gbt")) istypo = 1;
31.1596 + if (strstr(testword, "pbt")) istypo = 1;
31.1597 + if (strstr(testword, "tbs")) istypo = 1;
31.1598 + if (strstr(testword, "mrn")) istypo = 1;
31.1599 + if (strstr(testword, "ahle")) istypo = 1;
31.1600 + if (strstr(testword, "ihle")) istypo = 1;
31.1601 +
31.1602 + /* "TBE" does happen - like HEARTBEAT - but uncommon. */
31.1603 + /* Also "TBI" - frostbite, outbid - but uncommon. */
31.1604 + /* Similarly "ii" like Hawaii, or Pompeii, and in Roman numerals, */
31.1605 + /* but these are covered in V.20. "ii" is a common scanno. */
31.1606 + if (strstr(testword, "tbi")) istypo = 1;
31.1607 + if (strstr(testword, "tbe")) istypo = 1;
31.1608 + if (strstr(testword, "ii")) istypo = 1;
31.1609 +
31.1610 + /* check for no vowels or no consonants. */
31.1611 + /* If none, flag a typo */
31.1612 + if (!istypo && strlen(testword)>1) {
31.1613 + vowel = consonant = 0;
31.1614 + for (i = 0; testword[i]; i++)
31.1615 + if (testword[i] == 'y' || gcisdigit(testword[i])) { /* Yah, this is loose. */
31.1616 + vowel++;
31.1617 + consonant++;
31.1618 + }
31.1619 + else
31.1620 + if (strchr(vowels, testword[i])) vowel++;
31.1621 + else consonant++;
31.1622 + if (!vowel || !consonant) {
31.1623 + istypo = 1;
31.1624 + }
31.1625 + }
31.1626 +
31.1627 + /* now exclude the word from being reported if it's in */
31.1628 + /* the okword list */
31.1629 + for (i = 0; *okword[i]; i++)
31.1630 + if (!strcmp(testword, okword[i]))
31.1631 + istypo = 0;
31.1632 +
31.1633 + /* what looks like a typo may be a Roman numeral. Exclude these */
31.1634 + if (istypo)
31.1635 + if (isroman(testword))
31.1636 + istypo = 0;
31.1637 +
31.1638 + /* check the manual list of typos */
31.1639 + if (!istypo)
31.1640 + for (i = 0; *typo[i]; i++)
31.1641 + if (!strcmp(testword, typo[i]))
31.1642 + istypo = 1;
31.1643 +
31.1644 +
31.1645 + /* V.21 - check lowercase s and l - special cases */
31.1646 + /* V.98 - added "i" and "m" */
31.1647 + /* V.99 - added "j" often a semi-colon gone wrong */
31.1648 + /* - and "d" for a missing apostrophe - he d */
31.1649 + /* - and "n" for "in" */
31.1650 + if (!istypo && strlen(testword) == 1)
31.1651 + if (strchr("slmijdn", *inword))
31.1652 + istypo = 1;
31.1653 +
31.1654 +
31.1655 + if (istypo) {
31.1656 + isdup = 0;
31.1657 + if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH])
31.1658 + for (i = 0; i < qword_index; i++)
31.1659 + if (!strcmp(testword, qword[i])) {
31.1660 + isdup = 1;
31.1661 + ++dupcnt[i];
31.1662 + }
31.1663 + if (!isdup) {
31.1664 + if (qword_index < MAX_QWORD && strlen(testword) < MAX_QWORD_LENGTH) {
31.1665 + strcpy(qword[qword_index], testword);
31.1666 + qword_index++;
31.1667 + }
31.1668 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1669 + if (!pswit[OVERVIEW_SWITCH]) {
31.1670 + printf(" Line %ld column %d - Query word %s", linecnt, (int)(wordstart - aline) + 1, inword);
31.1671 + if (strlen(testword) < MAX_QWORD_LENGTH && !pswit[VERBOSE_SWITCH])
31.1672 + printf(" - not reporting duplicates");
31.1673 + printf("\n");
31.1674 + }
31.1675 + else
31.1676 + cnt_word++;
31.1677 + }
31.1678 + }
31.1679 + } /* end of typo-checking */
31.1680 +
31.1681 + /* check the user's list of typos */
31.1682 + if (!istypo)
31.1683 + if (usertypo_count)
31.1684 + for (i = 0; i < usertypo_count; i++)
31.1685 + if (!strcmp(testword, usertypo[i])) {
31.1686 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1687 + if (!pswit[OVERVIEW_SWITCH])
31.1688 + printf(" Line %ld column %d - Query possible scanno %s\n", linecnt, (int)(wordstart - aline) + 2, inword);
31.1689 + }
31.1690 +
31.1691 +
31.1692 +
31.1693 + if (pswit[PARANOID_SWITCH] && warn_digit) { /* in paranoid mode, query all 0 and 1 standing alone - added warn_digit V.97*/
31.1694 + if (!strcmp(inword, "0") || !strcmp(inword, "1")) {
31.1695 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1696 + if (!pswit[OVERVIEW_SWITCH])
31.1697 + printf(" Line %ld column %d - Query standalone %s\n", linecnt, (int)(wordstart - aline) + 2, inword);
31.1698 + else
31.1699 + cnt_word++;
31.1700 + }
31.1701 + }
31.1702 + }
31.1703 +
31.1704 + /* look for added or missing spaces around punctuation and quotes */
31.1705 + /* If there is a punctuation character like ! with no space on */
31.1706 + /* either side, suspect a missing!space. If there are spaces on */
31.1707 + /* both sides , assume a typo. If we see a double quote with no */
31.1708 + /* space or punctuation on either side of it, assume unspaced */
31.1709 + /* quotes "like"this. */
31.1710 + llen = strlen(aline);
31.1711 + for (i = 1; i < llen; i++) { /* for each character in the line after the first */
31.1712 + if (strchr(".?!,;:_", aline[i])) { /* if it's punctuation */
31.1713 + isacro = 0; /* we need to suppress warnings for acronyms like M.D. */
31.1714 + isellipsis = 0; /* we need to suppress warnings for ellipsis . . . */
31.1715 + if ( (gcisalpha(aline[i-1]) && gcisalpha(aline[i+1])) || /* if there are letters on both sides of it or ... */
31.1716 + (gcisalpha(aline[i+1]) && strchr("?!,;:", aline[i]))) { /* ...if it's strict punctuation followed by an alpha */
31.1717 + if (aline[i] == '.') {
31.1718 + if (i > 2)
31.1719 + if (aline[i-2] == '.') isacro = 1;
31.1720 + if (i + 2 < llen)
31.1721 + if (aline[i+2] == '.') isacro = 1;
31.1722 + }
31.1723 + if (!isacro) {
31.1724 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1725 + if (!pswit[OVERVIEW_SWITCH])
31.1726 + printf(" Line %ld column %d - Missing space?\n", linecnt, i+1);
31.1727 + else
31.1728 + cnt_punct++;
31.1729 + }
31.1730 + }
31.1731 + if (aline[i-1] == CHAR_SPACE && (aline[i+1] == CHAR_SPACE || aline[i+1] == 0)) { /* if there are spaces on both sides, or space before and end of line */
31.1732 + if (aline[i] == '.') {
31.1733 + if (i > 2)
31.1734 + if (aline[i-2] == '.') isellipsis = 1;
31.1735 + if (i + 2 < llen)
31.1736 + if (aline[i+2] == '.') isellipsis = 1;
31.1737 + }
31.1738 + if (!isemptyline && !isellipsis) {
31.1739 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1740 + if (!pswit[OVERVIEW_SWITCH])
31.1741 + printf(" Line %ld column %d - Spaced punctuation?\n", linecnt, i+1);
31.1742 + else
31.1743 + cnt_punct++;
31.1744 + }
31.1745 + }
31.1746 + }
31.1747 + }
31.1748 +
31.1749 + /* 0.98 -- split out the characters that CANNOT be preceded by space */
31.1750 + llen = strlen(aline);
31.1751 + for (i = 1; i < llen; i++) { /* for each character in the line after the first */
31.1752 + if (strchr("?!,;:", aline[i])) { /* if it's punctuation that _cannot_ have a space before it */
31.1753 + if (aline[i-1] == CHAR_SPACE && !isemptyline && aline[i+1] != CHAR_SPACE) { /* if aline[i+1) DOES == space, it was already reported just above */
31.1754 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1755 + if (!pswit[OVERVIEW_SWITCH])
31.1756 + printf(" Line %ld column %d - Spaced punctuation?\n", linecnt, i+1);
31.1757 + else
31.1758 + cnt_punct++;
31.1759 + }
31.1760 + }
31.1761 + }
31.1762 +
31.1763 +
31.1764 + /* 0.99 -- special case " .X" where X is any alpha. */
31.1765 + /* This plugs a hole in the acronym code above. Inelegant, but maintainable. */
31.1766 + llen = strlen(aline);
31.1767 + for (i = 1; i < llen; i++) { /* for each character in the line after the first */
31.1768 + if (aline[i] == '.') { /* if it's a period */
31.1769 + if (aline[i-1] == CHAR_SPACE && gcisalpha(aline[i+1])) { /* if the period follows a space and is followed by a letter */
31.1770 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1771 + if (!pswit[OVERVIEW_SWITCH])
31.1772 + printf(" Line %ld column %d - Spaced punctuation?\n", linecnt, i+1);
31.1773 + else
31.1774 + cnt_punct++;
31.1775 + }
31.1776 + }
31.1777 + }
31.1778 +
31.1779 +
31.1780 +
31.1781 +
31.1782 + /* v.21 breaking out the search for unspaced doublequotes */
31.1783 + /* This is not as efficient, but it's more maintainable */
31.1784 + /* V.97 added underscore to the list of characters not to query, */
31.1785 + /* since underscores are commonly used as italics indicators. */
31.1786 + /* V.98 Added slash as well, same reason. */
31.1787 + for (i = 1; i < llen; i++) { /* for each character in the line after the first */
31.1788 + if (aline[i] == CHAR_DQUOTE) {
31.1789 + if ((!strchr(" _-.'`,;:!/([{?}])", aline[i-1]) &&
31.1790 + !strchr(" _-.'`,;:!/([{?}])", aline[i+1]) &&
31.1791 + aline[i+1] != 0
31.1792 + || (!strchr(" _-([{'`", aline[i-1]) && gcisalpha(aline[i+1])))) {
31.1793 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1794 + if (!pswit[OVERVIEW_SWITCH])
31.1795 + printf(" Line %ld column %d - Unspaced quotes?\n", linecnt, i+1);
31.1796 + else
31.1797 + cnt_punct++;
31.1798 + }
31.1799 + }
31.1800 + }
31.1801 +
31.1802 +
31.1803 + /* v.98 check parity of quotes */
31.1804 + /* v.99 added !*(s+1) in some tests to catch "I am," he said, but I will not be soon". */
31.1805 + for (s = aline; *s; s++) {
31.1806 + if (*s == CHAR_DQUOTE) {
31.1807 + if (!(dquotepar = !dquotepar)) { /* parity even */
31.1808 + if (!strchr("_-.'`/,;:!?)]} ", *(s+1))) {
31.1809 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1810 + if (!pswit[OVERVIEW_SWITCH])
31.1811 + printf(" Line %ld column %d - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1);
31.1812 + else
31.1813 + cnt_punct++;
31.1814 + }
31.1815 + }
31.1816 + else { /* parity odd */
31.1817 + if (!gcisalpha(*(s+1)) && !isdigit(*(s+1)) && !strchr("_-/.'`([{$", *(s+1)) || !*(s+1)) {
31.1818 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1819 + if (!pswit[OVERVIEW_SWITCH])
31.1820 + printf(" Line %ld column %d - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1);
31.1821 + else
31.1822 + cnt_punct++;
31.1823 + }
31.1824 + }
31.1825 + }
31.1826 + }
31.1827 +
31.1828 + if (*aline == CHAR_DQUOTE) {
31.1829 + if (strchr(",;:!?)]} ", aline[1])) {
31.1830 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1831 + if (!pswit[OVERVIEW_SWITCH])
31.1832 + printf(" Line %ld column 1 - Wrongspaced quotes?\n", linecnt, (int)(s - aline)+1);
31.1833 + else
31.1834 + cnt_punct++;
31.1835 + }
31.1836 + }
31.1837 +
31.1838 + if (pswit[SQUOTE_SWITCH])
31.1839 + for (s = aline; *s; s++) {
31.1840 + if ((*s == CHAR_SQUOTE || *s == CHAR_OPEN_SQUOTE)
31.1841 + && ( s == aline || (s > aline && !gcisalpha(*(s-1))) || !gcisalpha(*(s+1)))) {
31.1842 + if (!(squotepar = !squotepar)) { /* parity even */
31.1843 + if (!strchr("_-.'`/\",;:!?)]} ", *(s+1))) {
31.1844 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1845 + if (!pswit[OVERVIEW_SWITCH])
31.1846 + printf(" Line %ld column %d - Wrongspaced singlequotes?\n", linecnt, (int)(s - aline)+1);
31.1847 + else
31.1848 + cnt_punct++;
31.1849 + }
31.1850 + }
31.1851 + else { /* parity odd */
31.1852 + if (!gcisalpha(*(s+1)) && !isdigit(*(s+1)) && !strchr("_-/\".'`", *(s+1)) || !*(s+1)) {
31.1853 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1854 + if (!pswit[OVERVIEW_SWITCH])
31.1855 + printf(" Line %ld column %d - Wrongspaced singlequotes?\n", linecnt, (int)(s - aline)+1);
31.1856 + else
31.1857 + cnt_punct++;
31.1858 + }
31.1859 + }
31.1860 + }
31.1861 + }
31.1862 +
31.1863 +
31.1864 + /* v.20 also look for double punctuation like ,. or ,, */
31.1865 + /* Thanks to DW for the suggestion! */
31.1866 + /* I'm putting this in a separate loop for clarity */
31.1867 + /* In books with references, ".," and ".;" are common */
31.1868 + /* e.g. "etc., etc.," and vol. 1.; vol 3.; */
31.1869 + /* OTOH, from my initial tests, there are also fairly */
31.1870 + /* common errors. What to do? Make these cases paranoid? */
31.1871 + /* V.21 ".," is the most common, so invented warn_dotcomma */
31.1872 + /* to suppress detailed reporting if it occurs often */
31.1873 + llen = strlen(aline);
31.1874 + for (i = 0; i < llen; i++) /* for each character in the line */
31.1875 + if (strchr(".?!,;:", aline[i]) /* if it's punctuation */
31.1876 + && (strchr(".?!,;:", aline[i+1]))
31.1877 + && aline[i] && aline[i+1]) /* followed by punctuation, it's a query, unless . . . */
31.1878 + if (
31.1879 + (aline[i] == aline[i+1]
31.1880 + && (aline[i] == '.' || aline[i] == '?' || aline[i] == '!'))
31.1881 + || (!warn_dotcomma && aline[i] == '.' && aline[i+1] == ',')
31.1882 + || (isFrench && !strncmp(aline+i, ",...", 4))
31.1883 + || (isFrench && !strncmp(aline+i, "...,", 4))
31.1884 + || (isFrench && !strncmp(aline+i, ";...", 4))
31.1885 + || (isFrench && !strncmp(aline+i, "...;", 4))
31.1886 + || (isFrench && !strncmp(aline+i, ":...", 4))
31.1887 + || (isFrench && !strncmp(aline+i, "...:", 4))
31.1888 + || (isFrench && !strncmp(aline+i, "!...", 4))
31.1889 + || (isFrench && !strncmp(aline+i, "...!", 4))
31.1890 + || (isFrench && !strncmp(aline+i, "?...", 4))
31.1891 + || (isFrench && !strncmp(aline+i, "...?", 4))
31.1892 + ) {
31.1893 + if ((isFrench && !strncmp(aline+i, ",...", 4)) /* could this BE any more awkward? */
31.1894 + || (isFrench && !strncmp(aline+i, "...,", 4))
31.1895 + || (isFrench && !strncmp(aline+i, ";...", 4))
31.1896 + || (isFrench && !strncmp(aline+i, "...;", 4))
31.1897 + || (isFrench && !strncmp(aline+i, ":...", 4))
31.1898 + || (isFrench && !strncmp(aline+i, "...:", 4))
31.1899 + || (isFrench && !strncmp(aline+i, "!...", 4))
31.1900 + || (isFrench && !strncmp(aline+i, "...!", 4))
31.1901 + || (isFrench && !strncmp(aline+i, "?...", 4))
31.1902 + || (isFrench && !strncmp(aline+i, "...?", 4)))
31.1903 + i +=4;
31.1904 + ; /* do nothing for .. !! and ?? which can be legit */
31.1905 + }
31.1906 + else {
31.1907 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1908 + if (!pswit[OVERVIEW_SWITCH])
31.1909 + printf(" Line %ld column %d - Double punctuation?\n", linecnt, i+1);
31.1910 + else
31.1911 + cnt_punct++;
31.1912 + }
31.1913 +
31.1914 + /* v.21 breaking out the search for spaced doublequotes */
31.1915 + /* This is not as efficient, but it's more maintainable */
31.1916 + s = aline;
31.1917 + while (strstr(s," \" ")) {
31.1918 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1919 + if (!pswit[OVERVIEW_SWITCH])
31.1920 + printf(" Line %ld column %d - Spaced doublequote?\n", linecnt, (int)(strstr(s," \" ")-aline+1));
31.1921 + else
31.1922 + cnt_punct++;
31.1923 + s = strstr(s," \" ") + 2;
31.1924 + }
31.1925 +
31.1926 + /* v.20 also look for spaced singlequotes ' and ` */
31.1927 + s = aline;
31.1928 + while (strstr(s," ' ")) {
31.1929 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1930 + if (!pswit[OVERVIEW_SWITCH])
31.1931 + printf(" Line %ld column %d - Spaced singlequote?\n", linecnt, (int)(strstr(s," ' ")-aline+1));
31.1932 + else
31.1933 + cnt_punct++;
31.1934 + s = strstr(s," ' ") + 2;
31.1935 + }
31.1936 +
31.1937 + s = aline;
31.1938 + while (strstr(s," ` ")) {
31.1939 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1940 + if (!pswit[OVERVIEW_SWITCH])
31.1941 + printf(" Line %ld column %d - Spaced singlequote?\n", linecnt, (int)(strstr(s," ` ")-aline+1));
31.1942 + else
31.1943 + cnt_punct++;
31.1944 + s = strstr(s," ` ") + 2;
31.1945 + }
31.1946 +
31.1947 + /* v.99 check special case of 'S instead of 's at end of word */
31.1948 + s = aline + 1;
31.1949 + while (*s) {
31.1950 + if (*s == CHAR_SQUOTE && *(s+1) == 'S' && *(s-1)>='a' && *(s-1)<='z') {
31.1951 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1952 + if (!pswit[OVERVIEW_SWITCH])
31.1953 + printf(" Line %ld column %d - Capital \"S\"?\n", linecnt, (int)(s-aline+2));
31.1954 + else
31.1955 + cnt_punct++;
31.1956 + }
31.1957 + s++;
31.1958 + }
31.1959 +
31.1960 +
31.1961 + /* v.21 Now check special cases - start and end of line - */
31.1962 + /* for single and double quotes. Start is sometimes [sic] */
31.1963 + /* but better to query it anyway. */
31.1964 + /* While I'm here, check for dash at end of line */
31.1965 + llen = strlen(aline);
31.1966 + if (llen > 1) {
31.1967 + if (aline[llen-1] == CHAR_DQUOTE ||
31.1968 + aline[llen-1] == CHAR_SQUOTE ||
31.1969 + aline[llen-1] == CHAR_OPEN_SQUOTE)
31.1970 + if (aline[llen-2] == CHAR_SPACE) {
31.1971 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1972 + if (!pswit[OVERVIEW_SWITCH])
31.1973 + printf(" Line %ld column %d - Spaced quote?\n", linecnt, llen);
31.1974 + else
31.1975 + cnt_punct++;
31.1976 + }
31.1977 +
31.1978 + /* V 0.98 removed aline[0] == CHAR_DQUOTE from the test below, since */
31.1979 + /* Wrongspaced quotes test also catches it for " */
31.1980 + if (aline[0] == CHAR_SQUOTE ||
31.1981 + aline[0] == CHAR_OPEN_SQUOTE)
31.1982 + if (aline[1] == CHAR_SPACE) {
31.1983 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1984 + if (!pswit[OVERVIEW_SWITCH])
31.1985 + printf(" Line %ld column 1 - Spaced quote?\n", linecnt);
31.1986 + else
31.1987 + cnt_punct++;
31.1988 + }
31.1989 + /* dash at end of line may well be legit - paranoid mode only */
31.1990 + /* and don't report em-dash at line-end */
31.1991 + if (pswit[PARANOID_SWITCH] && warn_hyphen) {
31.1992 + for (i = llen-1; i > 0 && (unsigned char)aline[i] <= CHAR_SPACE; i--);
31.1993 + if (aline[i] == '-' && aline[i-1] != '-') {
31.1994 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.1995 + if (!pswit[OVERVIEW_SWITCH])
31.1996 + printf(" Line %ld column %d - Hyphen at end of line?\n", linecnt, i);
31.1997 + }
31.1998 + }
31.1999 + }
31.2000 +
31.2001 + /* v.21 also look for brackets surrounded by alpha */
31.2002 + /* Brackets are often unspaced, but shouldn't be surrounded by alpha. */
31.2003 + /* If so, suspect a scanno like "a]most" */
31.2004 + llen = strlen(aline);
31.2005 + for (i = 1; i < llen-1; i++) { /* for each character in the line except 1st & last*/
31.2006 + if (strchr("{[()]}", aline[i]) /* if it's a bracket */
31.2007 + && gcisalpha(aline[i-1]) && gcisalpha(aline[i+1])) {
31.2008 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.2009 + if (!pswit[OVERVIEW_SWITCH])
31.2010 + printf(" Line %ld column %d - Unspaced bracket?\n", linecnt, i);
31.2011 + else
31.2012 + cnt_punct++;
31.2013 + }
31.2014 + }
31.2015 + /* The "Cinderella" case, back in again! :-S Give it another shot */
31.2016 + if (warn_endquote) {
31.2017 + llen = strlen(aline);
31.2018 + for (i = 1; i < llen; i++) { /* for each character in the line except 1st */
31.2019 + if (aline[i] == CHAR_DQUOTE)
31.2020 + if (isalpha(aline[i-1])) {
31.2021 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.2022 + if (!pswit[OVERVIEW_SWITCH])
31.2023 + printf(" Line %ld column %d - endquote missing punctuation?\n", linecnt, i);
31.2024 + else
31.2025 + cnt_punct++;
31.2026 + }
31.2027 + }
31.2028 + }
31.2029 +
31.2030 + llen = strlen(aline);
31.2031 +
31.2032 + /* Check for <HTML TAG> */
31.2033 + /* If there is a < in the line, followed at some point */
31.2034 + /* by a > then we suspect HTML */
31.2035 + if (strstr(aline, "<") && strstr(aline, ">")) {
31.2036 + i = (signed int) (strstr(aline, ">") - strstr(aline, "<") + 1);
31.2037 + if (i > 0) {
31.2038 + strncpy(wrk, strstr(aline, "<"), i);
31.2039 + wrk[i] = 0;
31.2040 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.2041 + if (!pswit[OVERVIEW_SWITCH])
31.2042 + printf(" Line %ld column %d - HTML Tag? %s \n", linecnt, (int)(strstr(aline, "<") - aline) + 1, wrk);
31.2043 + else
31.2044 + cnt_html++;
31.2045 + }
31.2046 + }
31.2047 +
31.2048 + /* Check for &symbol; HTML */
31.2049 + /* If there is a & in the line, followed at */
31.2050 + /* some point by a ; then we suspect HTML */
31.2051 + if (strstr(aline, "&") && strstr(aline, ";")) {
31.2052 + i = (int)(strstr(aline, ";") - strstr(aline, "&") + 1);
31.2053 + for (s = strstr(aline, "&"); s < strstr(aline, ";"); s++)
31.2054 + if (*s == CHAR_SPACE) i = 0; /* 0.99 don't report "Jones & Son;" */
31.2055 + if (i > 0) {
31.2056 + strncpy(wrk, strstr(aline,"&"), i);
31.2057 + wrk[i] = 0;
31.2058 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", aline);
31.2059 + if (!pswit[OVERVIEW_SWITCH])
31.2060 + printf(" Line %ld column %d - HTML symbol? %s \n", linecnt, (int)(strstr(aline, "&") - aline) + 1, wrk);
31.2061 + else
31.2062 + cnt_html++;
31.2063 + }
31.2064 + }
31.2065 +
31.2066 + /* At end of paragraph, check for mismatched quotes. */
31.2067 + /* We don't want to report an error immediately, since it is a */
31.2068 + /* common convention to omit the quotes at end of paragraph if */
31.2069 + /* the next paragraph is a continuation of the same speaker. */
31.2070 + /* Where this is the case, the next para should begin with a */
31.2071 + /* quote, so we store the warning message and only display it */
31.2072 + /* at the top of the next iteration if the new para doesn't */
31.2073 + /* start with a quote. */
31.2074 + /* The -p switch overrides this default, and warns of unclosed */
31.2075 + /* quotes on _every_ paragraph, whether the next begins with a */
31.2076 + /* quote or not. */
31.2077 + /* Version .16 - only report mismatched single quotes if */
31.2078 + /* an open_single_quotes was found. */
31.2079 +
31.2080 + if (isemptyline) { /* end of para - add up the totals */
31.2081 + if (quot % 2)
31.2082 + sprintf(dquote_err, " Line %ld - Mismatched quotes\n", linecnt);
31.2083 + if (pswit[SQUOTE_SWITCH] && open_single_quote && (open_single_quote != close_single_quote) )
31.2084 + sprintf(squote_err," Line %ld - Mismatched singlequotes?\n", linecnt);
31.2085 + if (pswit[SQUOTE_SWITCH] && open_single_quote
31.2086 + && (open_single_quote != close_single_quote)
31.2087 + && (open_single_quote != close_single_quote +1) )
31.2088 + squot = 1; /* flag it to be noted regardless of the first char of the next para */
31.2089 + if (r_brack)
31.2090 + sprintf(rbrack_err, " Line %ld - Mismatched round brackets?\n", linecnt);
31.2091 + if (s_brack)
31.2092 + sprintf(sbrack_err, " Line %ld - Mismatched square brackets?\n", linecnt);
31.2093 + if (c_brack)
31.2094 + sprintf(cbrack_err, " Line %ld - Mismatched curly brackets?\n", linecnt);
31.2095 + if (c_unders % 2)
31.2096 + sprintf(unders_err, " Line %ld - Mismatched underscores?\n", linecnt);
31.2097 + quot = s_brack = c_brack = r_brack = c_unders =
31.2098 + open_single_quote = close_single_quote = 0;
31.2099 + isnewpara = 1; /* let the next iteration know that it's starting a new para */
31.2100 + }
31.2101 +
31.2102 + /* V.21 _ALSO_ at end of paragraph, check for omitted punctuation. */
31.2103 + /* by working back through prevline. DW. */
31.2104 + /* Hmmm. Need to check this only for "normal" paras. */
31.2105 + /* So what is a "normal" para? ouch! */
31.2106 + /* Not normal if one-liner (chapter headings, etc.) */
31.2107 + /* Not normal if doesn't contain at least one locase letter */
31.2108 + /* Not normal if starts with space */
31.2109 +
31.2110 + /* 0.99 tighten up on para end checks. Disallow comma and */
31.2111 + /* semi-colon. Check for legit para end before quotes. */
31.2112 + if (isemptyline) { /* end of para */
31.2113 + for (s = prevline, i = 0; *s && !i; s++)
31.2114 + if (gcisletter(*s))
31.2115 + i = 1; /* use i to indicate the presence of a letter on the line */
31.2116 + /* This next "if" is a problem. */
31.2117 + /* If I say "start_para_line <= linecnt - 1", that includes one-line */
31.2118 + /* "paragraphs" like chapter heads. Lotsa false positives. */
31.2119 + /* If I say "start_para_line < linecnt - 1" it doesn't, but then it */
31.2120 + /* misses genuine one-line paragraphs. */
31.2121 + /* So what do I do? */
31.2122 + if (i
31.2123 + && lastblen > 2
31.2124 + && start_para_line < linecnt - 1
31.2125 + && *prevline > CHAR_SPACE
31.2126 + ) {
31.2127 + for (i = strlen(prevline)-1; (prevline[i] == CHAR_DQUOTE || prevline[i] == CHAR_SQUOTE) && prevline[i] > CHAR_SPACE && i > 0; i--);
31.2128 + for ( ; i > 0; i--) {
31.2129 + if (gcisalpha(prevline[i])) {
31.2130 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", prevline);
31.2131 + if (!pswit[OVERVIEW_SWITCH])
31.2132 + printf(" Line %ld column %d - No punctuation at para end?\n", linecnt-1, strlen(prevline));
31.2133 + else
31.2134 + cnt_punct++;
31.2135 + break;
31.2136 + }
31.2137 + if (strchr("-.:!([{?}])", prevline[i]))
31.2138 + break;
31.2139 + }
31.2140 + }
31.2141 + }
31.2142 + strcpy(prevline, aline);
31.2143 + }
31.2144 + fclose (infile);
31.2145 + if (!pswit[OVERVIEW_SWITCH])
31.2146 + for (i = 0; i < MAX_QWORD; i++)
31.2147 + if (dupcnt[i])
31.2148 + printf("\nNote: Queried word %s was duplicated %d time%s\n", qword[i], dupcnt[i], "s");
31.2149 +}
31.2150 +
31.2151 +
31.2152 +
31.2153 +/* flgets - get one line from the input stream, checking for */
31.2154 +/* the existence of exactly one CR/LF line-end per line. */
31.2155 +/* Returns a pointer to the line. */
31.2156 +
31.2157 +char *flgets(char *theline, int maxlen, FILE *thefile, long lcnt)
31.2158 +{
31.2159 + char c;
31.2160 + int len, isCR, cint;
31.2161 +
31.2162 + *theline = 0;
31.2163 + len = isCR = 0;
31.2164 + c = cint = fgetc(thefile);
31.2165 + do {
31.2166 + if (cint == EOF)
31.2167 + return (NULL);
31.2168 + if (c == 10) /* either way, it's end of line */
31.2169 + if (isCR)
31.2170 + break;
31.2171 + else { /* Error - a LF without a preceding CR */
31.2172 + if (pswit[LINE_END_SWITCH]) {
31.2173 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline);
31.2174 + if (!pswit[OVERVIEW_SWITCH])
31.2175 + printf(" Line %ld - No CR?\n", lcnt);
31.2176 + else
31.2177 + cnt_lineend++;
31.2178 + }
31.2179 + break;
31.2180 + }
31.2181 + if (c == 13) {
31.2182 + if (isCR) { /* Error - two successive CRs */
31.2183 + if (pswit[LINE_END_SWITCH]) {
31.2184 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline);
31.2185 + if (!pswit[OVERVIEW_SWITCH])
31.2186 + printf(" Line %ld - Two successive CRs?\n", lcnt);
31.2187 + else
31.2188 + cnt_lineend++;
31.2189 + }
31.2190 + }
31.2191 + isCR = 1;
31.2192 + }
31.2193 + else {
31.2194 + if (pswit[LINE_END_SWITCH] && isCR) {
31.2195 + if (pswit[ECHO_SWITCH]) printf("\n%s\n", theline);
31.2196 + if (!pswit[OVERVIEW_SWITCH])
31.2197 + printf(" Line %ld column %d - CR without LF?\n", lcnt, len+1);
31.2198 + else
31.2199 + cnt_lineend++;
31.2200 + }
31.2201 + theline[len] = c;
31.2202 + len++;
31.2203 + theline[len] = 0;
31.2204 + isCR = 0;
31.2205 + }
31.2206 + c = cint = fgetc(thefile);
31.2207 + } while(len < maxlen);
31.2208 + if (pswit[MARKUP_SWITCH])
31.2209 + postprocess_for_HTML(theline);
31.2210 + if (pswit[DP_SWITCH])
31.2211 + postprocess_for_DP(theline);
31.2212 + return(theline);
31.2213 +}
31.2214 +
31.2215 +
31.2216 +
31.2217 +
/* mixdigit - takes a "word" as a parameter, and checks whether it   */
/* contains a mixture of alpha and digits. Generally, this is an     */
/* error, but the following shapes are accepted as legitimate:       */
/*   - digits followed by st, rd, nd or th (any case): 21st, 4TH     */
/*   - the same with a trailing s:                      20ths        */
/*   - the same with a trailing ly:                     1stly, 2ndly */
/*   - digits followed by a single l, L, s or d:        12l. 3s. 11d.*/
/*   - a capital L followed by anything that is not itself queried   */
/*     by mixdigit (British pounds):                    L500         */
/* Returns 0 if no error found, 1 if error.                          */

int mixdigit(char *checkword)   /* check for digits like 1 or 0 in words */
{
    int wehaveadigit, wehavealetter, firstdigits, query, wl;
    char *s;

    wehaveadigit = wehavealetter = 0;
    for (s = checkword; *s; s++) {
        if (gcisalpha(*s))
            wehavealetter = 1;
        else if (gcisdigit(*s))
            wehaveadigit = 1;
    }
    if (!(wehaveadigit && wehavealetter))
        return (0);

    /* Mixed: query it unless one of the common legit cases applies. */
    query = 1;
    wl = strlen(checkword);
    for (firstdigits = 0; gcisdigit(checkword[firstdigits]); firstdigits++)
        ;

    /* digits, ending in st, rd, nd, th of either case.              */
    /* matchword returns nonzero on a match (unlike strcmp).         */
    if (firstdigits + 2 == wl &&
          (matchword(checkword + wl - 2, "st")
        || matchword(checkword + wl - 2, "rd")
        || matchword(checkword + wl - 2, "nd")
        || matchword(checkword + wl - 2, "th"))
       )
        query = 0;
    if (firstdigits + 3 == wl &&
          (matchword(checkword + wl - 3, "sts")
        || matchword(checkword + wl - 3, "rds")
        || matchword(checkword + wl - 3, "nds")
        || matchword(checkword + wl - 3, "ths"))
       )
        query = 0;
    /* The suffix here is four letters long, so the word must be the */
    /* leading digits plus four; this also guarantees wl >= 4, so    */
    /* checkword + wl - 4 never points before the start of the word. */
    if (firstdigits + 4 == wl &&
          (matchword(checkword + wl - 4, "stly")
        || matchword(checkword + wl - 4, "rdly")
        || matchword(checkword + wl - 4, "ndly")
        || matchword(checkword + wl - 4, "thly"))
       )
        query = 0;

    /* digits, ending in l, L, s or d */
    if (firstdigits + 1 == wl &&
          (checkword[wl-1] == 'l'
        || checkword[wl-1] == 'L'
        || checkword[wl-1] == 's'
        || checkword[wl-1] == 'd'))
        query = 0;

    /* L at the start of a number, representing British pounds, like L500 */
    /* This is cute. We know the current word is mixeddigit. If the first */
    /* letter is L, there must be at least one digit following. If both   */
    /* digits and letters follow, we have a genuine error, else we have a */
    /* capital L followed by digits, and we accept that as a non-error.   */
    if (checkword[0] == 'L') {
        if (!mixdigit(checkword + 1))
            query = 0;
    }
    return (query);
}
31.2285 +
31.2286 +
31.2287 +
31.2288 +
31.2289 +/* getaword - extracts the first/next "word" from the line, and puts */
31.2290 +/* it into "thisword". A word is defined as one English word unit */
31.2291 +/* -- or at least that's what I'm trying for. */
31.2292 +/* Returns a pointer to the position in the line where we will start */
31.2293 +/* looking for the next word. */
31.2294 +
31.2295 +char *getaword(char *fromline, char *thisword)
31.2296 +{
31.2297 + int i, wordlen;
31.2298 + char *s;
31.2299 +
31.2300 + wordlen = 0;
31.2301 + for ( ; !gcisdigit(*fromline) && !gcisalpha(*fromline) && *fromline ; fromline++ );
31.2302 +
31.2303 + /* V .20 */
31.2304 + /* add a look-ahead to handle exceptions for numbers like 1,000 and 1.35. */
31.2305 + /* Especially yucky is the case of L1,000 */
31.2306 + /* I hate this, and I see other ways, but I don't see that any is _better_.*/
31.2307 + /* This section looks for a pattern of characters including a digit */
31.2308 + /* followed by a comma or period followed by one or more digits. */
31.2309 + /* If found, it returns this whole pattern as a word; otherwise we discard */
31.2310 + /* the results and resume our normal programming. */
31.2311 + s = fromline;
31.2312 + for ( ; (gcisdigit(*s) || gcisalpha(*s) || *s == ',' || *s == '.') && wordlen < MAXWORDLEN ; s++ ) {
31.2313 + thisword[wordlen] = *s;
31.2314 + wordlen++;
31.2315 + }
31.2316 + thisword[wordlen] = 0;
31.2317 + for (i = 1; i < wordlen -1; i++) {
31.2318 + if (thisword[i] == '.' || thisword[i] == ',') {
31.2319 + if (gcisdigit(thisword[i-1]) && gcisdigit(thisword[i-1])) { /* we have one of the damned things */
31.2320 + fromline = s;
31.2321 + return(fromline);
31.2322 + }
31.2323 + }
31.2324 + }
31.2325 +
31.2326 + /* we didn't find a punctuated number - do the regular getword thing */
31.2327 + wordlen = 0;
31.2328 + for ( ; (gcisdigit(*fromline) || gcisalpha(*fromline) || *fromline == '\'') && wordlen < MAXWORDLEN ; fromline++ ) {
31.2329 + thisword[wordlen] = *fromline;
31.2330 + wordlen++;
31.2331 + }
31.2332 + thisword[wordlen] = 0;
31.2333 + return(fromline);
31.2334 +}
31.2335 +
31.2336 +
31.2337 +
31.2338 +
31.2339 +
/* matchword - just a case-insensitive string matcher.               */
/* Returns 1 when the two strings have the same length and are equal */
/* ignoring case, 0 otherwise. Note that this is the opposite sense  */
/* to strcmp, which returns 0 on equality.                           */

int matchword(char *checkfor, char *thisword)
{
    size_t len, i;

    len = strlen(checkfor);
    if (len != strlen(thisword)) return (0);

    for (i = 0; i < len; i++) {
        /* toupper is only defined for unsigned char values (or EOF), */
        /* and 8-bit text makes plain char negative on many systems.  */
        if (toupper((unsigned char)checkfor[i]) != toupper((unsigned char)thisword[i]))
            return (0);
    }
    return (1);
}
31.2356 +
31.2357 +
31.2358 +
31.2359 +
31.2360 +
/* lowerit - convert the line to lower case, in place.               */
/* Only the ASCII letters 'A' to 'Z' are changed; every other byte,  */
/* hi-bit characters included, is left exactly as it was. This is    */
/* done by hand rather than with strlwr/tolower so that the result   */
/* does not depend on the platform's handling of 8-bit characters.   */

void lowerit(char *theline)
{
    char *p = theline;

    while (*p != '\0') {
        if ('A' <= *p && *p <= 'Z')
            *p = (char)(*p + ('a' - 'A'));
        p++;
    }
}
31.2372 +
31.2373 +
/* isroman - does this (lowercase) word LOOK like a Roman numeral?   */
/* This is a shape check, not a validity check: mxxxxxxxxxx passes.  */
/* The word must consist of these pieces, in this order, with        */
/* nothing left over:                                                */
/*     any number of m, an optional d, an optional cm, an optional   */
/*     cd, any number of c, an optional xl, an optional xc, an       */
/*     optional l, any number of x, an optional ix, an optional iv,  */
/*     an optional v, any number of i.                               */
/* Only lowercase letters are recognised.                            */
/* Returns 1 if the whole word fits, 0 if not (or if t is NULL or    */
/* empty).                                                           */

int isroman(char *t)
{
    const char *p = t;

    if (p == NULL || p[0] == '\0')
        return (0);

    /* thousands and five-hundreds */
    while (p[0] == 'm')
        p++;
    if (p[0] == 'd')
        p++;

    /* hundreds */
    if (p[0] == 'c' && p[1] == 'm')
        p += 2;
    if (p[0] == 'c' && p[1] == 'd')
        p += 2;
    while (p[0] == 'c')
        p++;

    /* tens */
    if (p[0] == 'x' && p[1] == 'l')
        p += 2;
    if (p[0] == 'x' && p[1] == 'c')
        p += 2;
    if (p[0] == 'l')
        p++;
    while (p[0] == 'x')
        p++;

    /* units */
    if (p[0] == 'i' && p[1] == 'x')
        p += 2;
    if (p[0] == 'i' && p[1] == 'v')
        p += 2;
    if (p[0] == 'v')
        p++;
    while (p[0] == 'i')
        p++;

    /* it looks Roman only if every character has been accounted for */
    return (p[0] == '\0') ? 1 : 0;
}
31.2411 +
31.2412 +
31.2413 +
31.2414 +
/* gcisalpha - a version of isalpha() that is lenient on 8-bit texts.         */
/* With the standard isalpha(), 8-bit accented characters break words, so     */
/* that tete with accented characters appears to be two words, "t" and "t",   */
/* with 8-bit characters between them, and errors get over-reported.          */
/* gcisalpha() therefore also accepts accented letters from the CP1252        */
/* (Windows) and ISO-8859-1 character sets, the most common PG 8-bit types:   */
/*   - every code from 192 up, except 208, 215, 222, 240, 247 and 254;        */
/*   - the individual codes 140, 142, 156, 158 and 159.                       */
/* Returns 1 for a letter, 0 for anything else.                               */

int gcisalpha(unsigned char c)
{
    /* plain ASCII letters */
    if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
        return (1);

    /* upper block: letters, with a handful of exclusions */
    if (c >= 192) {
        switch (c) {
        case 208: case 215: case 222:
        case 240: case 247: case 254:
            return (0);
        default:
            return (1);
        }
    }

    /* the few letters that sit below 192 */
    switch (c) {
    case 140: case 142: case 156: case 158: case 159:
        return (1);
    default:
        return (0);
    }
}
31.2431 +
/* gcisdigit - a version of isdigit() that doesn't get confused in   */
/* 8-bit texts: only the characters '0' to '9' count as digits.      */
/* Returns 1 for a digit, 0 for anything else.                       */
int gcisdigit(unsigned char c)
{
    return ('0' <= c && c <= '9') ? 1 : 0;
}
31.2438 +
/* gcisletter - a letter test that doesn't get confused in 8-bit     */
/* texts. Accepts 'A'-'Z', 'a'-'z' and every code from 192 upwards.  */
/* Yeah, we're ISO-8859-1-specific. So sue me.                       */
/* Returns 1 for a letter, 0 for anything else.                      */
int gcisletter(unsigned char c)
{
    if (c >= 192)
        return (1);
    if (c >= 'A' && c <= 'Z')
        return (1);
    if (c >= 'a' && c <= 'z')
        return (1);
    return (0);
}
31.2446 +
31.2447 +
31.2448 +
31.2449 +
/* gcstrchr - strchr, except that searching for the zero character   */
/* returns NULL instead of a pointer to the string's terminator.     */

char *gcstrchr(char *s, char c)
{
    return (c == 0) ? NULL : strchr(s, c);
}
31.2457 +
31.2458 +/* postprocess_for_DP is derived from postprocess_for_HTML */
31.2459 +/* It is invoked with the -d switch from flgets(). */
31.2460 +/* It simply "removes" from the line a hard-coded set of common */
31.2461 +/* DP-specific tags, so that the line passed to the main routine has*/
31.2462 +/* been pre-cleaned of DP markup. */
31.2463 +
31.2464 +void postprocess_for_DP(char *theline)
31.2465 +{
31.2466 +
31.2467 + char *s, *t;
31.2468 + int i;
31.2469 +
31.2470 + if (!*theline)
31.2471 + return;
31.2472 +
31.2473 + for (i = 0; *DPmarkup[i]; i++) {
31.2474 + s = strstr(theline, DPmarkup[i]);
31.2475 + while (s) {
31.2476 + t = s + strlen(DPmarkup[i]);
31.2477 + while (*t) {
31.2478 + *s = *t;
31.2479 + t++; s++;
31.2480 + }
31.2481 + *s = 0;
31.2482 + s = strstr(theline, DPmarkup[i]);
31.2483 + }
31.2484 + }
31.2485 +
31.2486 +}
31.2487 +
31.2488 +
/* postprocess_for_HTML is, at the moment (0.97), a very nasty       */
/* short-term fix. It is invoked with the -m switch from flgets().   */
/* If the line contains both a '<' and a '>', losemarkup() is called */
/* repeatedly until it returns NULL; then, regardless,               */
/* loseentities() is called repeatedly until it returns NULL.        */
/* The effect is that the line passed to the main routine has been   */
/* pre-cleaned of a hard-coded set of common HTML tags and entities. */
/* This is _so_ not the right way to deal with HTML, but what is     */
/* needed now is not HTML handling proper: just ignoring <i> tags    */
/* and some others. To be revisited in future releases!              */

void postprocess_for_HTML(char *theline)
{
    if (strchr(theline, '<') != NULL && strchr(theline, '>') != NULL) {
        while (losemarkup(theline) != NULL)
            continue;
    }

    while (loseentities(theline) != NULL)
        continue;
}
31.2509 +
31.2510 +char *losemarkup(char *theline)
31.2511 +{
31.2512 + char *s, *t;
31.2513 + int i;
31.2514 +
31.2515 + if (!*theline)
31.2516 + return(NULL);
31.2517 +
31.2518 + s = strstr(theline, "<");
31.2519 + t = strstr(theline, ">");
31.2520 + if (!s || !t) return(NULL);
31.2521 + for (i = 0; *markup[i]; i++)
31.2522 + if (!tagcomp(s+1, markup[i])) {
31.2523 + if (!*(t+1)) {
31.2524 + *s = 0;
31.2525 + return(s);
31.2526 + }
31.2527 + else
31.2528 + if (t > s) {
31.2529 + strcpy(s, t+1);
31.2530 + return(s);
31.2531 + }
31.2532 + }
31.2533 + /* it's an unrecognized <xxx> */
31.2534 + return(NULL);
31.2535 +}
31.2536 +
31.2537 +char *loseentities(char *theline)
31.2538 +{
31.2539 + int i;
31.2540 + char *s, *t;
31.2541 +
31.2542 + if (!*theline)
31.2543 + return(NULL);
31.2544 +
31.2545 + for (i = 0; *entities[i].htmlent; i++) {
31.2546 + s = strstr(theline, entities[i].htmlent);
31.2547 + if (s) {
31.2548 + t = malloc((size_t)strlen(s));
31.2549 + if (!t) return(NULL);
31.2550 + strcpy(t, s + strlen(entities[i].htmlent));
31.2551 + strcpy(s, entities[i].textent);
31.2552 + strcat(s, t);
31.2553 + free(t);
31.2554 + return(theline);
31.2555 + }
31.2556 + }
31.2557 +
31.2558 + /* V0.97 Duh. Forgot to check the htmlnum member */
31.2559 + for (i = 0; *entities[i].htmlnum; i++) {
31.2560 + s = strstr(theline, entities[i].htmlnum);
31.2561 + if (s) {
31.2562 + t = malloc((size_t)strlen(s));
31.2563 + if (!t) return(NULL);
31.2564 + strcpy(t, s + strlen(entities[i].htmlnum));
31.2565 + strcpy(s, entities[i].textent);
31.2566 + strcat(s, t);
31.2567 + free(t);
31.2568 + return(theline);
31.2569 + }
31.2570 + }
31.2571 + return(NULL);
31.2572 +}
31.2573 +
31.2574 +
/* tagcomp - does the text following a '<' start with this tag name? */
/*   strin    the text immediately after the '<'; one leading '/'    */
/*            (a closing tag) is ignored.                            */
/*   basetag  the tag name to look for.                              */
/* The comparison is case-insensitive and only covers the length of  */
/* basetag, so whatever follows the name (attributes, '>') is not    */
/* examined; "<table border=1>" matches "table".                     */
/* Returns 0 if strin starts with basetag, 1 if it does not. Like    */
/* strcmp, zero means "match".                                       */

int tagcomp(char *strin, char *basetag)
{
    char *s, *t;

    s = basetag;
    t = strin;
    if (*t == '/') t++; /* ignore a slash */
    while (*s && *t) {
        /* tolower is only defined for unsigned char values (or EOF) */
        if (tolower((unsigned char)*s) != tolower((unsigned char)*t)) return (1);
        s++; t++;
    }
    /* If the input ran out before the whole tag name was compared,  */
    /* it is not a match.                                            */
    if (*s) return (1);

    /* OK, we have < followed by a valid tag start    */
    /* should I do something about length?            */
    /* this is messy. The length of an <i> tag is     */
    /* limited, but a <table> could go on for miles   */
    /* so I'd have to parse the tags . . . ugh.       */
    /* It isn't what is needed now, so mark it        */
    /* as 'pending'.                                  */
    return (0);
}
31.2595 +
/* proghelp - write the version/copyright banner, the command-line   */
/* usage summary and a short description of the program to stderr.   */
/* NOTE(review): the usage line lists the switches as -setpxloyhud   */
/* while the text below it also describes -v and -m; confirm against */
/* the switch parsing before changing either.                        */

void proghelp()                  /* explain program usage here */
{
    static const char *const helptext[] = {
        "V. 0.991. Copyright 2000-2005 Jim Tinsley <jtinsley@pobox.com>.\n",
        "Gutcheck comes with ABSOLUTELY NO WARRANTY. For details, read the file COPYING.\n",
        "This is Free Software; you may redistribute it under certain conditions (GPL);\n",
        "read the file COPYING for details.\n\n",
        "Usage is: gutcheck [-setpxloyhud] filename\n",
        " where -s checks single quotes, -e suppresses echoing lines, -t checks typos\n",
        " -x (paranoid) switches OFF -t and extra checks, -l turns OFF line-end checks\n",
        " -o just displays overview without detail, -h echoes header fields\n",
        " -v (verbose) unsuppresses duplicate reporting, -m suppresses markup\n",
        " -d ignores DP-specific markup,\n",
        " -u uses a file gutcheck.typ to query user-defined possible typos\n",
        "Sample usage: gutcheck warpeace.txt \n",
        "\n",
        "Gutcheck looks for errors in Project Gutenberg(TM) etexts.\n",
        "Gutcheck queries anything it thinks shouldn't be in a PG text; non-ASCII\n",
        "characters like accented letters, lines longer than 75 or shorter than 55,\n",
        "unbalanced quotes or brackets, a variety of badly formatted punctuation, \n",
        "HTML tags, some likely typos. It is NOT a substitute for human judgement.\n",
        "\n"
    };
    size_t i;

    for (i = 0; i < sizeof helptext / sizeof helptext[0]; i++)
        fputs(helptext[i], stderr);
}
31.2618 +
31.2619 +
31.2620 +
31.2621 +/*********************************************************************
31.2622 + Revision History:
31.2623 +
31.2624 + 04/22/01 Cleaned up some stuff and released .10
31.2625 +
31.2626 + ---------------
31.2627 +
31.2628 + 05/09/01 Added the typo list, added two extra cases of he/be error,
31.2629 + added -p switch, OPEN_SINGLE QUOTE char as .11
31.2630 +
31.2631 + ---------------
31.2632 +
31.2633 + 05/20/01 Increased the typo list,
31.2634 + added paranoid mode,
31.2635 + ANSIfied the code and added some casts
31.2636 + so the compiler wouldn't keep asking if I knew what I was doing,
31.2637 + fixed bug in l.s.d. condition (thanks, Dave!),
31.2638 + standardized spacing when echoing,
31.2639 + added letter-combo checking code to typo section,
31.2640 + added more h/b words to typo array.
31.2641 + Not too sure about putting letter combos outside of the TYPO conditions -
31.2642 + someone is sure to have a book about the tbaka tribe, or something. Anyway, let's see.
31.2643 + Released as .12
31.2644 +
31.2645 + ---------------
31.2646 +
31.2647 + 06/01/01 Removed duplicate reporting of Tildes, asterisks, etc.
31.2648 + 06/10/01 Added flgets routine to help with platform-independent
31.2649 + detection of invalid line-ends. All PG text files should
31.2650 + have CR/LF (13/10) at end of line, regardless of system.
31.2651 + Gutcheck now validates this by default. (Thanks, Charles!)
31.2652 + Released as .13
31.2653 +
31.2654 + ---------------
31.2655 +
31.2656 + 06/11/01 Added parenthesis match checking. (c_brack, cbrack_err etc.)
31.2657 + Released as .14
31.2658 +
31.2659 + ---------------
31.2660 +
31.2661 + 06/23/01 Fixed: 'No',he said. not being flagged.
31.2662 +
31.2663 + Improved: better single-quotes checking:
31.2664 +
31.2665 + Ignore singlequotes surrounded by alpha, like didn't. (was OK)
31.2666 +
31.2667 + If a singlequote is at the END of a word AND the word ends in "s":
31.2668 + The dogs' tails wagged.
31.2669 + it's probably an apostrophe, but less commonly may be a closequote:
31.2670 + "These 'pack dogs' of yours look more like wolves."
31.2671 +
31.2672 + If it's got punctuation before it and is followed by a space
31.2673 + or punctuation:
31.2674 + . . . was a problem,' he said
31.2675 + . . . was a problem,'"
31.2676 + it is probably (certainly?) a closequote.
31.2677 +
31.2678 + If it's at start of paragraph, it's probably an openquote.
31.2679 + (but watch dialect)
31.2680 +
31.2681 + Words with ' at beginning and end are probably quoted:
31.2682 + "You have the word 'chivalry' frequently on your lips."
31.2683 + (Not specifically implemented)
31.2684 + V.18 I'm glad I didn't implement this, 'cos it jest ain't so
31.2685 + where the convention is to punctuate outside the quotes.
31.2686 + 'Come', he said, 'and join the party'.
31.2687 +
31.2688 + If it is followed by an alpha, and especially a capital:
31.2689 + 'Hello,' called he.
31.2690 + it is either an openquote or dialect.
31.2691 +
31.2692 + Dialect breaks ALL the rules:
31.2693 + A man's a man for a' that.
31.2694 + "Aye, but 'tis all in the pas' now."
31.2695 + "'Tis often the way," he said.
31.2696 + 'Ave a drink on me.
31.2697 +
31.2698 + This version looks to be an improvement, and produces
31.2699 + fewer false positives, but is still not perfect. The
31.2700 + 'pack dogs' case still fools it, and dialect is still
31.2701 + a problem. Oh, well, it's an improvement, and I have
31.2702 + a weighted structure in place for refining guesses at
31.2703 + closequotes. Maybe next time, I'll add a bit of logic
31.2704 + where if there is an open quote and one that was guessed
31.2705 + to be a possessive apostrophe after s, I'll re-guess it
31.2706 + to be a closequote. Let's see how this one flies, first.
31.2707 +
31.2708 + (Afterview: it's still crap. Needs much work, and a deeper insight.)
31.2709 +
31.2710 + Released as .15
31.2711 +
31.2712 + TODO: More he/be checks. Can't be perfect - counterexamples:
31.2713 + I gave my son good advice: be married regardless of the world's opinion.
31.2714 + I gave my son good advice: he married regardless of the world's opinion.
31.2715 +
31.2716 + If by "primitive" be meant "crude", we can understand the sentence.
31.2717 + If by "primitive" he meant "crude", we can understand the sentence.
31.2718 +
31.2719 + No matter what be said, I must go on.
31.2720 + No matter what he said, I must go on.
31.2721 +
31.2722 + No value, however great, can be set upon them.
31.2723 + No value, however great, can he set upon them.
31.2724 +
31.2725 + Real-Life one from a DP International Weekly Miscellany:
31.2726 + He wandered through the forest without fear, sleeping
31.2727 + much, for in sleep be had companionship--the Great
31.2728 + Spirit teaching him what he should know in dreams.
31.2729 + That one found by jeebies, and it turned out to be "he".
31.2730 +
31.2731 +
31.2732 + ---------------
31.2733 +
31.2734 + 07/01/01 Added -O option.
31.2735 + Improved singlequotes by reporting mismatched single quotes
31.2736 + only if an open_single_quotes was found.
31.2737 +
31.2738 + Released as .16
31.2739 +
31.2740 + ---------------
31.2741 +
31.2742 + 08/27/01 Added -Y switch for Robert Rowe to allow his app to
31.2743 + catch the error output.
31.2744 +
31.2745 + Released as .17
31.2746 +
31.2747 + ---------------
31.2748 +
31.2749 + 09/08/01 Added checking Capitals at start of paragraph, but not
31.2750 + checking them at start of sentence.
31.2751 +
31.2752 + TODO: Parse sentences out so can check reliably for start of
31.2753 + sentence. Need a whole different approach for that.
31.2754 + (Can't just rely on periods, since they are also
31.2755 + used for abbreviations, etc.)
31.2756 +
31.2757 + Added checking for all vowels or all consonants in a word.
31.2758 +
31.2759 + While I was in, I added "ii" checking and "tl" at start of word.
31.2760 +
31.2761 + Added echoing of first line of paragraph when reporting
31.2762 + mismatched quotes or brackets (thanks to David Widger for the
31.2763 + suggestion)
31.2764 +
31.2765 + Not querying L at start of a number (used for British pounds).
31.2766 +
31.2767 + The spelling changes are sort of half-done but released anyway
31.2768 + Skipped .18 because I had given out a couple of test versions
31.2769 + with that number.
31.2770 +
31.2771 + 09/25/01 Released as .19
31.2772 +
31.2773 + ---------------
31.2774 +
31.2775 + TODO:
31.2776 + Use the logic from my new version of safewrap to stop querying
31.2777 + short lines like poems and TOCs.
31.2778 + Ignore non-standard ellipses like . . . or ...
31.2779 +
31.2780 +
31.2781 + ---------------
31.2782 + 10/01/01 Made any line over 80 a VERY long line (was 85).
31.2783 + Recognized openquotes on indented paragraphs as continuations
31.2784 + of the same speech.
31.2785 + Added "cf" to the okword list (how did I forget _that_?) and a few others.
31.2786 + Moved abbrev to okword and made it more general.
31.2787 + Removed requirement that PG_space_emdash be greater than
31.2788 + ten before turning off warnings about spaced dashes.
31.2789 + Added period to list of characters that might constitute a separator line.
31.2790 + Now checking for double punctuation (Thanks, David!)
31.2791 + Now if two spaced em-dashes on a line, reports both. (DW)
31.2792 + Bug: Wasn't catching spaced punctuation at line-end since I
31.2793 + added flgets in version .13 - fixed.
31.2794 + Bug: Wasn't catching spaced singlequotes - fixed
31.2795 + Now reads punctuated numbers like 1,000 as a single word.
31.2796 + (Used to give "standalone 1" type queries)
31.2797 + Changed paranoid mode - not including s and p options. -ex is now quite usable.
31.2798 + Bug: was calling `"For it is perfectly impossible," Unspaced Quotes - fixed
31.2799 + Bug: Sometimes gave _next_ line number for queried word at end of line - fixed
31.2800 +
31.2801 + 10/22/01 Released as .20
31.2802 +
31.2803 + ---------------
31.2804 +
31.2805 + Added count of lines with spaces at end. (cnt_spacend) (Thanks, Brett!)
31.2806 + Reduced the number of hi-bit letters needed to stop reporting them
31.2807 + from 1/20 to 1/100 or 200 in total.
31.2808 + Added PG footer check.
31.2809 + Added the -h switch.
31.2810 + Fixed platform-specific CHAR_EOL checking for isemptyline - changed to 13 and 10
31.2811 + Not reporting ".," when there are many of them, such as a book with many references to "Vol 1., p. 23"
31.2812 + Added unspaced brackets check when surrounded by alpha.
31.2813 + Removed all typo reporting unless the typo switch is on.
31.2814 + Added gcisalpha to ease over-reporting of 8-bit queries.
31.2815 + ECHO_SWITCH is now ON by default!
31.2816 + PARANOID_SWITCH is now ON by default!
31.2817 + Checking for ">From" placed there by e-mail MTA (Thanks Andrew & Greg)
31.2818 + Checking for standalone lowercase "l"
31.2819 + Checking for standalone lowercase "s"
31.2820 + Considering "is be" and "be is" "be was" "was be" as he/be errors
31.2821 + Looking at punct at end of para
31.2822 +
31.2823 + 01/20/02 Released as .21
31.2824 +
31.2825 + ---------------
31.2826 +
31.2827 + Added VERBOSE_SWITCH to make it list everything. (George Davis)
31.2828 +
31.2829 + ---------------
31.2830 +
31.2831 + 02/17/02 Added cint in flgets to try fix an EOF failure on a compiler I don't have.
31.2832 + after which
31.2833 + This line caused a coredump on Solaris - fixed.
31.2834 + Da sagte die Figur: " Das ist alles gar schoen, und man mag die Puppe
31.2835 + 03/09/02 Changed header recognition for another header change
31.2836 + Called it .24
31.2837 + 03/29/02 Added qword[][] so I can suppress massive overreporting
31.2838 + of queried "words" like "FN", "Wm.", "th'", people's
31.2839 + initials, chemical formulae and suchlike in some texts.
31.2840 + Called it .25
31.2841 + 04/07/02 The qword summary reports at end shouldn't show in OVERVIEW mode. Fixed.
31.2842 + Added linecounts in overview mode.
31.2843 + Wow! gutcheck gutcheck.exe doesn't report a binary! :-) Need to tighten up. Done.
31.2844 + "m" is a not uncommon scanno for "in", but also appears in "a.m." - Can I get round that?
31.2845 + 07/07/02 Added GPL.
31.2846 + Added checking for broken em-dash at line-end (enddash)
31.2847 + Released as 0.95
31.2848 + 08/17/02 Fixed a bug that treated some hi-bit characters as spaces. Thanks, Carlo.
31.2849 + Released as 0.96
31.2850 + 10/10/02 Suppressing some annoying multiple reports by default:
31.2851 + Standalone Ones, Asterisks, Square Brackets.
31.2852 + Digit 1 occurs often in many scientific texts.
31.2853 + Asterisk occurs often in multi-footnoted texts.
31.2854 + Mismatch Square Brackets occurs often in multi-para footnotes.
31.2855 + Added -m switch for Charlz. Horrible. Nasty. Kludgy. Evil.
31.2856 + . . . but it does more or less work for the main cases.
31.2857 + Removed uppercase within a word as a separate category so
31.2858 + that names like VanAllen get reported only once, like other
31.2859 + suspected typos.
31.2860 + 11/24/02 Fixed - -m switch wasn't looking at htmlnum in
31.2861 + loseentities (Thanks, Brett!)
31.2862 + Fixed bug which occasionally gave false warning of
31.2863 + paragraph starting with lowercase.
31.2864 + Added underscore as character not to query around doublequotes.
31.2865 + Split the "Non-ASCII" message into "Non-ASCII" vs. "Non-ISO-8859"
31.2866 + . . . this is to help detect things like CP1252 characters.
31.2867 + Released as 0.97
31.2868 +
31.2869 + 12/01/02 Hacked a simplified version of the "Wrongspaced quotes" out of gutspell,
31.2870 + for doublequotes only. Replaces "Spaced quote", since it also covers that
31.2871 + case.
31.2872 + Added "warn_hyphen" to ease over-reporting of hyphens.
31.2873 +
31.2874 + 12/20/02 Added "extra period" checks.
31.2875 + Added single character line check
31.2876 + Added I" check - is usually an exclam
31.2877 + Released as 0.98
31.2878 +
31.2879 + 1/5/03 Eeek! Left in a lowerit(argv[0]) at the start before procfile()
31.2880 + from when I was looking at ways to identify markup. Refuses to
31.2881 + open files for *nix users with upcase in the filenames. Removed.
31.2882 + Fixed quickly and released as 0.981
31.2883 +
31.2884 + 1/8/03 Added "arid" to the list of typos, slightly against my better
31.2885 + judgement, but the DP gang are all excited about it. :-)
31.2886 + Added a check for comma followed by capital letter, where
31.2887 + a period has OCRed into a comma. (DW). Not sure about this
31.2888 + either; we'll see.
31.2889 + Compiling for Win32 to allow longfilenames.
31.2890 +
31.2891 + 6/1/04 A messy test release for DW to include the "gutcheck.typ"
31.2892 + process. And the gutcheck.jee trials. Removed "arid" --
31.2893 + it can go in gutcheck.typ
31.2894 +
31.2895 + Added checks for carets ^ and slants / but disabling slant
31.2896 + queries if more than 20 of them, because some people use them
31.2897 + for /italics/. Slants are commonly mistaken italic "I"s.
31.2898 +
31.2899 + Later: removed gutcheck.jee -- wrote jeebies instead.
31.2900 +
31.2901 +Random TODO:
31.2902 + Check brackets more closely, like quotes, so that it becomes
31.2903 + easy to find the error in long paragraphs full of brackets.
31.2904 +
31.2905 +
31.2906 + 11/4/04 Assorted cleanup. Fixed case where text started with an
31.2907 + unbalanced paragraph.
31.2908 +
31.2909 + 1/2/05 Has it really been that long? Added "nocomma", "noperiod" check.
31.2910 + Bits and pieces: improved isroman(). Added isletter().
31.2911 + Other stuff I never noted before this.
31.2912 +
31.2913 + 7/3/05 Stuck in a quick start on DP-markup ignoring
31.2914 + at BillFlis's suggestion.
31.2915 +
31.2916 + 1/23/06 Took out nocomma etc if typos are off. Why did I ever leave that in?
31.2917 + Don't count footer for dotcomma etc.
31.2918 +
31.2919 +
31.2920 +1 I
31.2921 +ail all
31.2922 +arc are
31.2923 +arid and
31.2924 +bad had
31.2925 +ball hall
31.2926 +band hand
31.2927 +bar her
31.2928 +bat but
31.2929 +be he
31.2930 +bead head
31.2931 +beads heads
31.2932 +bear hear
31.2933 +bit hit
31.2934 +bo be
31.2935 +boon been
31.2936 +borne home
31.2937 +bow how
31.2938 +bumbled humbled
31.2939 +car ear
31.2940 +carnage carriage
31.2941 +carne came
31.2942 +cast east
31.2943 +cat cut
31.2944 +cat eat
31.2945 +cheek check
31.2946 +clay day
31.2947 +coining coming
31.2948 +comer corner
31.2949 +die she
31.2950 +docs does
31.2951 +ease case
31.2952 +fail fall
31.2953 +fee he
31.2954 +haying having
31.2955 +ho he
31.2956 +ho who
31.2957 +hut but
31.2958 +is as
31.2959 +lie he
31.2960 +lime time
31.2961 +loth 10th
31.2962 +m in
31.2963 +modem modern
31.2964 +Ms his
31.2965 +ray away
31.2966 +ray my
31.2967 +ringer finger
31.2968 +ringers fingers
31.2969 +rioted noted
31.2970 +tho the
31.2971 +tie he
31.2972 +tie the
31.2973 +tier her
31.2974 +tight right
31.2975 +tile the
31.2976 +tiling thing
31.2977 +tip up
31.2978 +tram train
31.2979 +tune time
31.2980 +u "
31.2981 +wen well
31.2982 +yon you
31.2983 +
31.2984 +*********************************************************************/
31.2985 +
32.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
32.2 +++ b/gutcheck/gutcheck.typ.in Tue Jan 24 23:54:05 2012 +0000
32.3 @@ -0,0 +1,54 @@
32.4 +11
32.5 +44
32.6 +ms
32.7 +ail
32.8 +alien
32.9 +arc
32.10 +arid
32.11 +bar
32.12 +bat
32.13 +bo
32.14 +borne
32.15 +bow
32.16 +bum
32.17 +bumbled
32.18 +carnage
32.19 +carne
32.20 +cither
32.21 +coining
32.22 +comer
32.23 +cur
32.24 +docs
32.25 +eve
32.26 +eves
32.27 +gaming
32.28 +gram
32.29 +guru
32.30 +hag
32.31 +hare
32.32 +haying
32.33 +ho
32.34 +lime
32.35 +loth
32.36 +m
32.37 +modem
32.38 +nave
32.39 +ringer
32.40 +ringers
32.41 +riot
32.42 +rioted
32.43 +signer
32.44 +snore
32.45 +spam
32.46 +tho
32.47 +tier
32.48 +tile
32.49 +tiling
32.50 +tram
32.51 +tum
32.52 +tune
32.53 +u
32.54 +vas
32.55 +wag
32.56 +wen
32.57 +yon
33.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
33.2 +++ b/test/Makefile.am Tue Jan 24 23:54:05 2012 +0000
33.3 @@ -0,0 +1,1 @@
33.4 +SUBDIRS=harness compatibility .
34.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
34.2 +++ b/test/compatibility/Makefile.am Tue Jan 24 23:54:05 2012 +0000
34.3 @@ -0,0 +1,7 @@
34.4 +TESTS_ENVIRONMENT=GUTCHECK=../../gutcheck/gutcheck ../harness/gc-test
34.5 +TESTS=missing-space.tst spaced-punctuation.tst html-tag.tst html-symbol.tst \
34.6 + spaced-doublequote.tst mismatched-quotes.tst he-be.tst digits.tst \
34.7 + extra-period.tst ellipsis.tst short-line.tst abbreviation.tst \
34.8 + example.tst
34.9 +
34.10 +dist_pkgdata_DATA=$(TESTS)
35.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
35.2 +++ b/test/compatibility/abbreviation.tst Tue Jan 24 23:54:05 2012 +0000
35.3 @@ -0,0 +1,9 @@
35.4 +**************** INPUT ****************
35.5 +This period is an error.But the periods in a.m. aren't.
35.6 +**************** EXPECTED ****************
35.7 +
35.8 +This period is an error.But the periods in a.m. aren't.
35.9 + Line 1 column 45 - Query word m - not reporting duplicates
35.10 +
35.11 +This period is an error.But the periods in a.m. aren't.
35.12 + Line 1 column 24 - Missing space?
36.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
36.2 +++ b/test/compatibility/digits.tst Tue Jan 24 23:54:05 2012 +0000
36.3 @@ -0,0 +1,12 @@
36.4 +**************** INPUT ****************
36.5 +0K--this'11 make you look close1y.
36.6 +**************** EXPECTED ****************
36.7 +
36.8 +0K--this'11 make you look close1y.
36.9 + Line 1 column 1 - Query digit in 0K
36.10 +
36.11 +0K--this'11 make you look close1y.
36.12 + Line 1 column 3 - Query digit in this'11
36.13 +
36.14 +0K--this'11 make you look close1y.
36.15 + Line 1 column 26 - Query digit in close1y
37.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
37.2 +++ b/test/compatibility/ellipsis.tst Tue Jan 24 23:54:05 2012 +0000
37.3 @@ -0,0 +1,7 @@
37.4 +**************** INPUT ****************
37.5 +There are some complications . The extra space left around that
37.6 +period was an error . . . but that ellipsis wasn't.
37.7 +**************** EXPECTED ****************
37.8 +
37.9 +There are some complications . The extra space left around that
37.10 + Line 1 column 30 - Spaced punctuation?
38.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
38.2 +++ b/test/compatibility/example.tst Tue Jan 24 23:54:05 2012 +0000
38.3 @@ -0,0 +1,87 @@
38.4 +**************** INPUT ****************
38.5 +They saw him distinctly, as with the naked eye; a word, a turn of
38.6 +the pen, or a word unsaid, offered the picture of him in America,
38.7 +Japan, China, Australia , nay, the continent of Europe, holding an
38.8 +English review of his Maker's grotesques. Vernon seemed a
38.9 +sheepish fellow, without stature abroad, glad of a compliment
38.10 +, grateful for a dinner, endeavouring sadly to digest all he saw
38.11 +and heard. But one was a Patterne; tbe other a Whitford. One had
38.12 +genius; the other pottered after him to he a student. One was the
38.13 +English gent1eman wherever he went; the other was a new kind of
38.14 +thing, nondescript, produced in England of late, and not likely
38.15 +to come to much good himself, or do much good to the country.
38.16 +
38.17 +Vernon's dancing in America was capitally described by Willoughby.
38.18 +"Adieu to our cousins!" the latter wrote on his voyage to Japan.
38.19 +"I may possibly have had some vogue in their ball-rooms, and in
38.20 +showing them an English seat on horseback: 1 must resign myself if
38.21 +I have not been popular among them. I could not sing their
38.22 +national song--if a congery of states be a nation-- and I must
38.23 +confess I listened with frigid politeness to their singing of it.
38.24 +A great people, no doubt. Adieu to them. I have had to tear old
38.25 +Vernon away. He had serious thoughts of settling, means to
38.26 +
38.27 +correspond with some of them. On the whole, forgetting two or
38.28 +more "traits of insolence~ on the part of his hosts, which he
38.29 +cited, Willoughby escaped pretty comfortably. The President had
38.30 +been, consciously or not,uncivil, but one knew his origin! Upon
38.31 +these interjections, placable flicks of the lionly tail addressed
38.32 +to Britannia the Ruler, who expected him in some mildish way to
38.33 +lash terga cauda in retiring, Sir WilIoughby Patterne passed from
38.34 +a land of alien manners,; and ever after he spoke of America
38.35 +respectfully aud pensively, with a tail tucked in, as it were. His
38.36 +travels were profitable to himself. The fact is, that tbere are
38.37 +cousins who come to greatness and rnust be pacified, or they will
38.38 +prove annoying. Heaven forefend a collision between cousins!
38.39 +**************** EXPECTED ****************
38.40 +
38.41 +Japan, China, Australia , nay, the continent of Europe, holding an
38.42 + Line 3 column 25 - Spaced punctuation?
38.43 +
38.44 +, grateful for a dinner, endeavouring sadly to digest all he saw
38.45 + Line 6 column 1 - Begins with punctuation?
38.46 +
38.47 +and heard. But one was a Patterne; tbe other a Whitford. One had
38.48 + Line 7 column 34 - Query word tbe - not reporting duplicates
38.49 +
38.50 +genius; the other pottered after him to he a student. One was the
38.51 + Line 8 column 37 - Query he/be error?
38.52 +
38.53 +English gent1eman wherever he went; the other was a new kind of
38.54 + Line 9 column 8 - Query digit in gent1eman
38.55 +
38.56 +showing them an English seat on horseback: 1 must resign myself if
38.57 + Line 16 column 43 - Query standalone 1
38.58 +
38.59 +national song--if a congery of states be a nation-- and I must
38.60 + Line 18 column 50 - Spaced em-dash?
38.61 +
38.62 +Vernon away. He had serious thoughts of settling, means to
38.63 + Line 21 column 58 - No punctuation at para end?
38.64 +
38.65 +Vernon's dancing in America was capitally described by Willoughby.
38.66 + Line 22 - Mismatched quotes
38.67 +
38.68 +correspond with some of them. On the whole, forgetting two or
38.69 + Line 23 column 1 - Paragraph starts with lower-case
38.70 +
38.71 +more "traits of insolence~ on the part of his hosts, which he
38.72 + Line 24 column 26 - Tilde character?
38.73 +
38.74 +been, consciously or not,uncivil, but one knew his origin! Upon
38.75 + Line 26 column 25 - Missing space?
38.76 +
38.77 +lash terga cauda in retiring, Sir WilIoughby Patterne passed from
38.78 + Line 29 column 34 - Query word WilIoughby - not reporting duplicates
38.79 +
38.80 +a land of alien manners,; and ever after he spoke of America
38.81 + Line 30 column 24 - Double punctuation?
38.82 +
38.83 +respectfully aud pensively, with a tail tucked in, as it were. His
38.84 + Line 31 column 13 - Query word aud - not reporting duplicates
38.85 +
38.86 +travels were profitable to himself. The fact is, that tbere are
38.87 + Line 32 column 54 - Query word tbere - not reporting duplicates
38.88 +
38.89 +cousins who come to greatness and rnust be pacified, or they will
38.90 + Line 33 column 34 - Query word rnust - not reporting duplicates
39.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
39.2 +++ b/test/compatibility/extra-period.tst Tue Jan 24 23:54:05 2012 +0000
39.3 @@ -0,0 +1,6 @@
39.4 +**************** INPUT ****************
39.5 +"If you do. you'll regret it!"
39.6 +**************** EXPECTED ****************
39.7 +
39.8 +"If you do. you'll regret it!"
39.9 + Line 1 column 11 - Extra period?
40.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
40.2 +++ b/test/compatibility/he-be.tst Tue Jan 24 23:54:05 2012 +0000
40.3 @@ -0,0 +1,6 @@
40.4 +**************** INPUT ****************
40.5 +The horse is said to he worth a lot.
40.6 +**************** EXPECTED ****************
40.7 +
40.8 +The horse is said to he worth a lot.
40.9 + Line 1 column 18 - Query he/be error?
41.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
41.2 +++ b/test/compatibility/html-symbol.tst Tue Jan 24 23:54:05 2012 +0000
41.3 @@ -0,0 +1,6 @@
41.4 +**************** INPUT ****************
41.5 +&So;
41.6 +**************** EXPECTED ****************
41.7 +
41.8 +&So;
41.9 + Line 1 column 1 - HTML symbol? &So;
42.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
42.2 +++ b/test/compatibility/html-tag.tst Tue Jan 24 23:54:05 2012 +0000
42.3 @@ -0,0 +1,6 @@
42.4 +**************** INPUT ****************
42.5 +<This is a tag>
42.6 +**************** EXPECTED ****************
42.7 +
42.8 +<This is a tag>
42.9 + Line 1 column 1 - HTML Tag? <This is a tag>
43.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
43.2 +++ b/test/compatibility/mismatched-quotes.tst Tue Jan 24 23:54:05 2012 +0000
43.3 @@ -0,0 +1,8 @@
43.4 +**************** INPUT ****************
43.5 +Margaret said: "Now you should start for school.
43.6 +
43.7 +New paragraph.
43.8 +**************** EXPECTED ****************
43.9 +
43.10 +Margaret said: "Now you should start for school.
43.11 + Line 2 - Mismatched quotes
44.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
44.2 +++ b/test/compatibility/missing-space.tst Tue Jan 24 23:54:05 2012 +0000
44.3 @@ -0,0 +1,6 @@
44.4 +**************** INPUT ****************
44.5 +"Look!John, over there!"
44.6 +**************** EXPECTED ****************
44.7 +
44.8 +"Look!John, over there!"
44.9 + Line 1 column 6 - Missing space?
45.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
45.2 +++ b/test/compatibility/short-line.tst Tue Jan 24 23:54:05 2012 +0000
45.3 @@ -0,0 +1,15 @@
45.4 +**************** INPUT ****************
45.5 +The second line of a paragraph isn't usually short at all
45.6 +and
45.7 +should be flagged as a warning by gutcheck as long as there
45.8 +are sufficient numbers of lines in the file to stop it deciding
45.9 +that there are too many short lines to bother reporting, which
45.10 +means that I have to waffle on until we have at least 10 lines
45.11 +of text.
45.12 +
45.13 +The last line of a paragraph
45.14 +is usually short.
45.15 +**************** EXPECTED ****************
45.16 +
45.17 +and
45.18 + Line 2 column 3 - Short line 3?
46.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
46.2 +++ b/test/compatibility/spaced-doublequote.tst Tue Jan 24 23:54:05 2012 +0000
46.3 @@ -0,0 +1,9 @@
46.4 +**************** INPUT ****************
46.5 +Margaret said: " Now you should start for school."
46.6 +**************** EXPECTED ****************
46.7 +
46.8 +Margaret said: " Now you should start for school."
46.9 + Line 1 column 16 - Wrongspaced quotes?
46.10 +
46.11 +Margaret said: " Now you should start for school."
46.12 + Line 1 column 15 - Spaced doublequote?
47.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
47.2 +++ b/test/compatibility/spaced-punctuation.tst Tue Jan 24 23:54:05 2012 +0000
47.3 @@ -0,0 +1,6 @@
47.4 +**************** INPUT ****************
47.5 +"Look! John , over there!"
47.6 +**************** EXPECTED ****************
47.7 +
47.8 +"Look! John , over there!"
47.9 + Line 1 column 13 - Spaced punctuation?
48.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
48.2 +++ b/test/harness/Makefile.am Tue Jan 24 23:54:05 2012 +0000
48.3 @@ -0,0 +1,8 @@
48.4 +INCLUDES=-I$(top_srcdir)
48.5 +bin_PROGRAMS=gc-test
48.6 +AM_CFLAGS=$(GLIB_CFLAGS)
48.7 +LIBS=$(GLIB_LIBS)
48.8 +
48.9 +gc_test_SOURCES=gc-test.c testcase.c testcase.h testcaseio.c testcaseio.h \
48.10 + testcaseparser.c testcaseparser.h
48.11 +gc_test_LDADD=../../gclib/libgc.la
49.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
49.2 +++ b/test/harness/gc-test.c Tue Jan 24 23:54:05 2012 +0000
49.3 @@ -0,0 +1,31 @@
49.4 +#include <stdlib.h>
49.5 +#include <stdio.h>
49.6 +#include <string.h>
49.7 +#include <gclib/gclib.h>
49.8 +#include "testcase.h"
49.9 +#include "testcaseio.h"
49.10 +
49.11 +/*
49.12 + * Returns FALSE if the test should be considered to have failed.
49.13 + * (returns TRUE on pass or expected-fail).
49.14 + */
49.15 +boolean run_test(const char *filename)
49.16 +{
49.17 + Testcase *testcase;
49.18 + boolean retval;
49.19 + testcase=testcase_parse_file(filename);
49.20 + if (!testcase)
49.21 + return FALSE;
49.22 + retval=testcase_run(testcase);
49.23 + testcase_free(testcase);
49.24 + return retval;
49.25 +}
49.26 +
49.27 +int main(int argc,char **argv)
49.28 +{
49.29 + int i;
49.30 + boolean pass=TRUE;
49.31 + for(i=1;i<argc;i++)
49.32 + pass&=run_test(argv[i]);
49.33 + return pass?0:1;
49.34 +}
50.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
50.2 +++ b/test/harness/testcase.c Tue Jan 24 23:54:05 2012 +0000
50.3 @@ -0,0 +1,203 @@
50.4 +#include <stdlib.h>
50.5 +#include <stdio.h>
50.6 +#include <string.h>
50.7 +#include <unistd.h>
50.8 +#include <errno.h>
50.9 +#ifdef WIN32
50.10 +#include <io.h>
50.11 +#include <fcntl.h>
50.12 +#endif
50.13 +#include <gclib/gclib.h>
50.14 +#include "testcase.h"
50.15 +
#if !HAVE_MKSTEMP
/*
 * An insecure implementation of mkstemp(), for those platforms that
 * don't support it.
 *
 * On success the generated name is copied into template and a descriptor
 * open for reading and writing on the newly created file is returned.
 * On failure -1 is returned with errno set and template is left untouched.
 */
int mkstemp(char *template)
{
    int fd,saved_errno;
    char *s;
    for(;;)
    {
        s=str_dup(template);
        mktemp(s);
        if (!*s)
        {
            /* mktemp() leaves an empty string when it cannot make a name */
            mem_free(s);
            errno=EEXIST;
            return -1;
        }
        fd=open(s,O_RDWR|O_CREAT|O_EXCL,0600);
        /* 0 is a perfectly valid descriptor; only negative means failure */
        if (fd>=0)
        {
            strcpy(template,s);
            mem_free(s);
            return fd;
        }
        saved_errno=errno;
        mem_free(s);
        /*
         * Only a name collision is worth another attempt. Any other
         * error (permissions, missing directory, ...) would simply
         * repeat forever.
         */
        if (saved_errno!=EEXIST)
        {
            errno=saved_errno;
            return -1;
        }
    }
}
#endif /* !HAVE_MKSTEMP */
50.47 +
/*
 * As write(), but always convert NL to CR NL.
 *
 * The bytes are written through a stdio stream opened on a duplicate of
 * fd, so fd itself is still open when this function returns.
 * Returns count on success and (size_t)-1 on any failure.
 */
static size_t write_text(int fd,const char *buf,size_t count)
{
    const char *p,*end=buf+count;
    FILE *stream;
    int copy=dup(fd);
    if (copy<0)
        return -1;
#ifdef WIN32
    if (_setmode(copy,_O_BINARY)<0)
    {
        close(copy);
        return -1;
    }
#endif
    stream=fdopen(copy,"wb");
    if (!stream)
    {
        close(copy);
        return -1;
    }
    for(p=buf;p<end;p++)
    {
        /* Each NL is preceded by a CR that we emit ourselves */
        if ((*p=='\n' && putc('\r',stream)==EOF) || putc(*p,stream)==EOF)
        {
            (void)fclose(stream);
            return -1;
        }
    }
    if (fclose(stream))
        return -1;
    return count;
}
50.89 +
/*
 * Return the length (in bytes) of the longest common prefix of s1 and s2.
 */
size_t common_prefix_length(const char *s1,const char *s2)
{
    const char *p=s1,*q=s2;
    /* A NUL in s2 can never equal a non-NUL in s1, so one NUL test suffices */
    while(*p && *p==*q)
    {
        p++;
        q++;
    }
    return p-s1;
}
50.100 +
50.101 +/*
50.102 + * Run a testcase, returning FALSE on fail or error and
50.103 + * TRUE on pass or expected-fail.
50.104 + * Suitable message(s) will be printed in all cases.
50.105 + */
50.106 +boolean testcase_run(Testcase *testcase)
50.107 +{
50.108 + boolean r;
50.109 + int fd,exit_status,col;
50.110 + size_t n,pos,offset,header_len;
50.111 + FILE *fp;
50.112 + char input[]="TEST-XXXXXX";
50.113 + char *endp,*bol;
50.114 + char *command[3];
50.115 + String *expected,*report;
50.116 + char *output;
50.117 + fd=mkstemp(input);
50.118 + if (testcase->input)
50.119 + n=strlen(testcase->input);
50.120 + else
50.121 + n=0;
50.122 + if (n && write_text(fd,testcase->input,n)!=n)
50.123 + {
50.124 + perror(input);
50.125 + close(fd);
50.126 + (void)remove(input);
50.127 + return FALSE;
50.128 + }
50.129 + close(fd);
50.130 + command[0]=getenv("GUTCHECK");
50.131 + if (!command[0])
50.132 + command[0]="." GC_DIR_SEPARATOR_S "gutcheck";
50.133 + command[1]=input;
50.134 + command[2]=NULL;
50.135 + if (testcase->expected)
50.136 + r=spawn_sync(command,&output,&exit_status);
50.137 + else
50.138 + {
50.139 + r=spawn_sync(command,NULL,&exit_status);
50.140 + output=NULL;
50.141 + }
50.142 + (void)remove(input);
50.143 + if (!r)
50.144 + return FALSE;
50.145 + if (testcase->expected)
50.146 + {
50.147 + expected=string_new("\n\nFile: ");
50.148 + string_append(expected,input);
50.149 + string_append(expected,"\n\n\n");
50.150 + header_len=expected->len;
50.151 + string_append(expected,testcase->expected);
50.152 + }
50.153 + else
50.154 + {
50.155 + expected=NULL;
50.156 + header_len=0;
50.157 + }
50.158 + if (expected && strcmp(output,expected->str))
50.159 + {
50.160 + fprintf(stderr,"%s: FAIL\n",testcase->basename);
50.161 + offset=common_prefix_length(output,expected->str);
50.162 + if (offset==header_len && !output[offset])
50.163 + fprintf(stderr,"Unexpected zero warnings from gutcheck.\n");
50.164 + else
50.165 + {
50.166 + endp=strchr(output+offset,'\n');
50.167 + if (!endp)
50.168 + endp=output+strlen(output);
50.169 + report=string_new(NULL);
50.170 + string_append_len(report,output,endp-output);
50.171 + bol=strrchr(report->str,'\n');
50.172 + if (bol)
50.173 + bol++;
50.174 + else
50.175 + bol=report->str;
50.176 + col=offset-(bol-report->str);
50.177 + fprintf(stderr,"Unexpected output from gutcheck:\n");
50.178 + if (report->len>=header_len)
50.179 + fprintf(stderr,"%s\n%*s^\n",report->str+header_len,col,"");
50.180 + else
50.181 + fprintf(stderr,"%s\n%*s^\n",report->str,col,"");
50.182 + string_free(report,TRUE);
50.183 + }
50.184 + string_free(expected,TRUE);
50.185 + mem_free(output);
50.186 + return FALSE;
50.187 + }
50.188 + string_free(expected,TRUE);
50.189 + mem_free(output);
50.190 + if (exit_status)
50.191 + fprintf(stderr,"gutcheck exited with code %d\n",r);
50.192 + if (!exit_status)
50.193 + fprintf(stderr,"%s: PASS\n",testcase->basename);
50.194 + return !exit_status;
50.195 +}
50.196 +
50.197 +/*
50.198 + * Free a testcase.
50.199 + */
50.200 +void testcase_free(Testcase *testcase)
50.201 +{
50.202 + mem_free(testcase->basename);
50.203 + mem_free(testcase->input);
50.204 + mem_free(testcase->expected);
50.205 + mem_free(testcase);
50.206 +}
51.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
51.2 +++ b/test/harness/testcase.h Tue Jan 24 23:54:05 2012 +0000
51.3 @@ -0,0 +1,16 @@
51.4 +#ifndef TESTCASE_H
51.5 +#define TESTCASE_H
51.6 +
/*
 * A single testcase as loaded by testcase_parse_file().
 */
typedef struct {
    /* File name of the testcase without directory or final extension */
    char *basename;
    /* Text of the INPUT section, or NULL if the file had none */
    char *input;
    /* Text of the EXPECTED section, or NULL if the file had none */
    char *expected;
    /*
     * Bit flags.
     * NOTE(review): TESTCASE_XFAIL presumably marks an expected failure,
     * but nothing in the harness sources shown sets or tests it - confirm.
     */
    enum {
        TESTCASE_XFAIL = 1<<0,
    } flags;
} Testcase;
51.15 +
51.16 +boolean testcase_run(Testcase *testcase);
51.17 +void testcase_free(Testcase *testcase);
51.18 +
51.19 +#endif /* TESTCASE_H */
52.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
52.2 +++ b/test/harness/testcaseio.c Tue Jan 24 23:54:05 2012 +0000
52.3 @@ -0,0 +1,63 @@
52.4 +#include <stdlib.h>
52.5 +#include <stdio.h>
52.6 +#include <string.h>
52.7 +#include <gclib/gclib.h>
52.8 +#include "testcaseparser.h"
52.9 +#include "testcaseio.h"
52.10 +
52.11 +/*
52.12 + * Read a testcase in from a file.
52.13 + * On error, print a suitable message on stderr and return NULL.
52.14 + * The returned testcase should be freed with testcase_free().
52.15 + */
52.16 +Testcase *testcase_parse_file(const char *filename)
52.17 +{
52.18 + Testcase *testcase;
52.19 + TestcaseParser *parser;
52.20 + char *s;
52.21 + const char *tag,*text;
52.22 + boolean found_tag=FALSE;
52.23 + parser=testcase_parser_new_from_file(filename);
52.24 + if (!parser)
52.25 + return NULL;
52.26 + if (!*testcase_parser_get_flag(parser))
52.27 + {
52.28 + fprintf(stderr,"%s: Not a valid testcase (flag)\n",filename);
52.29 + testcase_parser_free(parser);
52.30 + return NULL;
52.31 + }
52.32 + testcase=mem_new0(Testcase,1);
52.33 + testcase->basename=path_get_basename(filename);
52.34 + s=strrchr(testcase->basename,'.');
52.35 + if (s)
52.36 + *s='\0';
52.37 + while(testcase_parser_get_next_tag(parser,&tag,&text))
52.38 + {
52.39 + if (!testcase->input && !strcmp(tag,"INPUT"))
52.40 + testcase->input=str_dup(text);
52.41 + else if (!testcase->expected && !strcmp(tag,"EXPECTED"))
52.42 + testcase->expected=str_dup(text);
52.43 + else
52.44 + {
52.45 + fprintf(stderr,"%s: Not a valid testcase (%s)\n",filename,tag);
52.46 + testcase_free(testcase);
52.47 + testcase_parser_free(parser);
52.48 + return NULL;
52.49 + }
52.50 + found_tag=TRUE;
52.51 + }
52.52 + if (!testcase_parser_at_eof(parser))
52.53 + {
52.54 + if (found_tag)
52.55 + fprintf(stderr,"%s: Not a valid testcase (garbage at end)\n",
52.56 + filename);
52.57 + else
52.58 + fprintf(stderr,"%s: Not a valid testcase (no valid tags)\n",
52.59 + filename);
52.60 + testcase_free(testcase);
52.61 + testcase_parser_free(parser);
52.62 + return NULL;
52.63 + }
52.64 + testcase_parser_free(parser);
52.65 + return testcase;
52.66 +}
53.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
53.2 +++ b/test/harness/testcaseio.h Tue Jan 24 23:54:05 2012 +0000
53.3 @@ -0,0 +1,8 @@
53.4 +#ifndef TESTCASE_IO_H
53.5 +#define TESTCASE_IO_H
53.6 +
53.7 +#include "testcase.h"
53.8 +
53.9 +Testcase *testcase_parse_file(const char *filename);
53.10 +
53.11 +#endif /* TESTCASE_IO_H */
54.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
54.2 +++ b/test/harness/testcaseparser.c Tue Jan 24 23:54:05 2012 +0000
54.3 @@ -0,0 +1,115 @@
54.4 +#include <stdlib.h>
54.5 +#include <stdio.h>
54.6 +#include <string.h>
54.7 +#include <ctype.h>
54.8 +#include <gclib/gclib.h>
54.9 +#include "testcaseparser.h"
54.10 +
54.11 +/*
54.12 + * Get the flag (the string of characters which bracket tags in test cases).
54.13 + */
54.14 +const char *testcase_parser_get_flag(TestcaseParser *parser)
54.15 +{
54.16 + char *s=parser->contents;
54.17 + if (!parser->flag)
54.18 + {
54.19 + parser->flag=string_new(NULL);
54.20 + while(*s>' ' && *s<='~')
54.21 + string_append_c(parser->flag,*s++);
54.22 + }
54.23 + return parser->flag->str;
54.24 +}
54.25 +
54.26 +/*
54.27 + * Test if the parser has reached the end of the input file
54.28 + */
54.29 +boolean testcase_parser_at_eof(TestcaseParser *parser)
54.30 +{
54.31 + return !parser->contents[parser->pos];
54.32 +}
54.33 +
54.34 +/*
54.35 + * Get the next tag (and its associated text, if any) from a test case.
54.36 + * Returns: TRUE if successful and FALSE if no more valid tags are present.
54.37 + * Callers can call testcase_parser_at_eof() when testcase_parser_get_next_tag()
54.38 + * to distinguish EOF and text which isn't a valid tag.
54.39 + */
54.40 +boolean testcase_parser_get_next_tag(TestcaseParser *parser,const char **tag,
54.41 + const char **text)
54.42 +{
54.43 + size_t n;
54.44 + char *eol,*endp;
54.45 + String *string;
54.46 + mem_free(parser->tag);
54.47 + parser->tag=NULL;
54.48 + mem_free(parser->tag_text);
54.49 + parser->tag_text=NULL;
54.50 + (void)testcase_parser_get_flag(parser);
54.51 + if (strncmp(parser->contents+parser->pos,parser->flag->str,
54.52 + parser->flag->len))
54.53 + return FALSE;
54.54 + eol=strchr(parser->contents+parser->pos,'\n');
54.55 + if (!eol)
54.56 + return FALSE;
54.57 + endp=eol-parser->flag->len;
54.58 + if (strncmp(endp,parser->flag->str,parser->flag->len))
54.59 + return FALSE;
54.60 + while(endp>parser->contents && isspace(endp[-1]))
54.61 + endp--;
54.62 + parser->pos+=parser->flag->len;
54.63 + while(isspace(parser->contents[parser->pos]))
54.64 + parser->pos++;
54.65 + parser->tag=str_ndup(parser->contents+parser->pos,
54.66 + endp-(parser->contents+parser->pos));
54.67 + parser->pos=eol-parser->contents+1;
54.68 + string=string_new(NULL);
54.69 + while (!testcase_parser_at_eof(parser) &&
54.70 + strncmp(parser->contents+parser->pos,parser->flag->str,parser->flag->len))
54.71 + {
54.72 + eol=strchr(parser->contents+parser->pos,'\n');
54.73 + if (eol)
54.74 + n=eol-(parser->contents+parser->pos)+1;
54.75 + else
54.76 + n=strlen(parser->contents+parser->pos);
54.77 + string_append_len(string,parser->contents+parser->pos,n);
54.78 + parser->pos+=n;
54.79 + }
54.80 + parser->tag_text=string_free(string,FALSE);
54.81 + if (!parser->tag_text)
54.82 + parser->tag_text=str_dup("");
54.83 + if (tag)
54.84 + *tag=parser->tag;
54.85 + if (text)
54.86 + *text=parser->tag_text;
54.87 + return TRUE;
54.88 +}
54.89 +
54.90 +/*
54.91 + * Create a testcase parser to read a regular file.
54.92 + */
54.93 +TestcaseParser *testcase_parser_new_from_file(const char *filename)
54.94 +{
54.95 + TestcaseParser *parser;
54.96 + parser=mem_new0(TestcaseParser,1);
54.97 + if (!file_get_contents_text(filename,&parser->contents,NULL))
54.98 + {
54.99 + mem_free(parser);
54.100 + return NULL;
54.101 + }
54.102 + parser->filename=str_dup(filename);
54.103 + return parser;
54.104 +}
54.105 +
54.106 +/*
54.107 + * Free a testcase parser.
54.108 + */
54.109 +void testcase_parser_free(TestcaseParser *parser)
54.110 +{
54.111 + mem_free(parser->filename);
54.112 + mem_free(parser->contents);
54.113 + if (parser->flag)
54.114 + string_free(parser->flag,TRUE);
54.115 + mem_free(parser->tag);
54.116 + mem_free(parser->tag_text);
54.117 + mem_free(parser);
54.118 +}
55.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
55.2 +++ b/test/harness/testcaseparser.h Tue Jan 24 23:54:05 2012 +0000
55.3 @@ -0,0 +1,22 @@
55.4 +#ifndef TESTCASE_PARSER_H
55.5 +#define TESTCASE_PARSER_H
55.6 +
55.7 +#include <gclib/gclib.h>
55.8 +
55.9 +typedef struct {
55.10 + char *filename;
55.11 + char *contents;
55.12 + String *flag;
55.13 + size_t pos;
55.14 + char *tag;
55.15 + char *tag_text;
55.16 +} TestcaseParser;
55.17 +
55.18 +const char *testcase_parser_get_flag(TestcaseParser *parser);
55.19 +boolean testcase_parser_get_next_tag(TestcaseParser *parser,const char **tag,
55.20 + const char **text);
55.21 +boolean testcase_parser_at_eof(TestcaseParser *parser);
55.22 +TestcaseParser *testcase_parser_new_from_file(const char *filename);
55.23 +void testcase_parser_free(TestcaseParser *parser);
55.24 +
55.25 +#endif /* TESTCASE_PARSER_H */