summaryrefslogtreecommitdiff
path: root/examples/Tutorial.org
blob: 730891a774138b0c15ce0abf487d73fae023f64d (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940

  Table of Contents

  1. [[#concepts--conventions][Concepts & Conventions]]
     - [[#parsers][Parsers]]
     - [[#on-the-terms-accept-succeed-fail-and-result][On the Terms "Accept", "Succeed", "Fail" and "Result"]]
     - [[#naming-conventions][Naming Conventions]]
  2. [[#getting-started-finally][Getting Started (Finally)]]
  3. [[#jumping-in-with-whitespace][Jumping in with Whitespace]]
  4. [[#mapping-results-to-true-false-and-null][Mapping Results to true, false, and null]]
  5. [[#the-fundamental-bind-parsing-numbers][The Fundamental <<bind, Parsing Numbers]]
  6. [[#bracketed-sequences-with-strings][Bracketed Sequences with Strings]]
  7. [[#recursive-parsers][Recursive Parsers]]
     - [[#parsing-json-files][Parsing JSON Files]]
     - [[#problems-to-puzzle-out][Problems to Puzzle Out]]
  8. [[#conclusion][Conclusion]]
     - [[#code-listing][code listing]]

* The Goofism Guide to PARZIVAL

  _A Gentle Introduction_

  This tutorial will guide you through the process of creating a
  moderately sophisticated parser using [[https://github.com/cbeo/parzival][parzival]], a [[https://common-lisp.net/][Common Lisp]]
  library for writing stream parsers using the [[https://en.wikipedia.org/wiki/Parser_combinator]["parser combinator"]]
  approach.

  To motivate your learning, you will be building up a parser for the
  familiar JSON format, so , take a minute to scroll all the way
  through the [[https://www.json.org/json-en.html][definition document]].

  Notice the document's structure. At the top is a the definition of a
  JSON object. That definition refers to other terms, like arrays and
  strings, the definitions of which appear below. And those terms
  refer to still simpler terms, all the way to the bottom of the
  document where the term for whitespace is defined.

  As you work through this tutorial, you will work *up* the JSON
  definition. That is, you will begin at the very bottom, with
  whitespace. From there you will define parsers for increasingly
  complex terms, all the way up to the top where the JSON object
  appears.

  Enough chit-chat. Time to get going.

** Concepts & Conventions

   You should first understand a few concepts and conventions that
   will come up in the rest of the tutorial.

*** Parsers

    In =parzival= a parser is a function that accepts a character
    stream and returns three values:

    1. A result value, which can be anything.
    2. A success indicator, which is ~T~ if the parse succeeded, or
       ~NIL~ otherwise.
    3. The stream itself, which can be passed to further parsers or can
       be examined if the parse failed.


*** On the Terms "Accept", "Succeed", "Fail" and "Result"

    Some of the terms used to talk about parsing can be perhaps
    confused or conflated with terms used to talk about
    functions. This is especially the case in =parzival= because a
    parser *is* just a function.

    When parsing an input stream, the parser is said to "accept" the
    input when the parse "succeeds" with a "result".  Otherwise the
    parser is said to "fail" to accept the input it was given.

    I.e. On the one hand, you may be said to *call* a *function* with
    *arguments* so that it *returns* a value. On the other hand, a
    *parser* will *accept* *input* and either *result* in a value or
    *fail*.

    It may seem like nitpicking, but these terms are used frequently
    in =parzival='s documentation and in this tutorial.  It is my hope
    that explicit mention of the terms here will make the tutorial
    easier to read and understand.

*** Naming Conventions

    The =parzival= package exports a number of tragically un-lispy
    looking symbols. You'll see things like =<<bind= and =<alphanum<=
    and even =<<?=. But do not despair! There is a method to this
    madness.

    There are two sorts of value that are important in =parzival=.
    The first are the parsers themselves, which are discussed
    above. The second are higher-order functions that operate on,
    transform, and create new parsers; i.e. the "combinators".

    Parzival adopts two simple naming conventions.

    1. Symbols that look like =<moo<= refer to parsers.
    2. Symbols that look like =<<moo= refer to combinators.

    As an example:

#+BEGIN_SRC lisp

(<<def <article<
       (<<plus (<<string "a") (<<string "the"))
 "A parser that accepts a lowercase English article.".)

#+END_SRC

   Without knowing the exact meaning of the above code, you can
   quickly see that =<article<= is a parser and it is defined using the
   combinators =<<plus=, and =<<string=.

   The =<<def= form is actually a macro that is used to define
   parsers.

** Getting Started (Finally)

   As of April 2020 =parzival= is not in quicklisp, hence you must
   manually install some things first.

   If quicklisp is installed in your local home directory, do:

   : cd $HOME/quicklisp/local-projects
   : git clone https://github.com/cbeo/replay-streams
   : git clone https://github.com/cbeo/parzival

   Now you should be able to fire up a lisp and do

#+BEGIN_SRC lisp

(ql:quickload :parzival)

;; and for convenience, go ahead and define a package:

(defpackage :parzival-json
   (:use :cl :parzival)
   (:nicknames :pz-json))

(in-package :parzival-json)

#+END_SRC

  Time for your first parser!

** Jumping in with Whitespace

   Looking at the JSON document, you see that =whitespace= is any of
   the characters =space=, =linefeed=, =carriage return=, or
   =horizontal tab=, repeated zero or more times.

   This translates fairly directly to a parser definition:

#+BEGIN_SRC lisp

(<<def <ws<                    ; define parsers with <<def
       (<<*                    ; zero or more, the Kleene star, reminiscent of regex
         (<<or                 ; any of the the following
           (<<char #\Space)    ; parse exactly one character
           (<<char #\Linefeed)
           (<<char #\Return)
           (<<char #\Tab))))

#+END_SRC

And you can test this out in the repl:

#+BEGIN_SRC lisp

PZ-JSON> (let ((string "   "))
           (parse string <ws< t))
(#\  #\  #\ )
T
#<REPLAY-STREAMS:STATIC-TEXT-REPLAY-STREAM {100884A713}>

PZ-JSON> (let ((string "
         "))
           (parse string <ws< t))
(#\  #\  #\  #\Newline #\  #\  #\  #\  #\  #\  #\  #\  #\ )
T
#<REPLAY-STREAMS:STATIC-TEXT-REPLAY-STREAM {10088055F3}>

PZ-JSON>

#+END_SRC

So what is going on?  The combinators =<<char=, =<<or=, and =<<*= all
create parsers.  The expression =(<<char #\Space)=, for example,
creates a parser that accepts exactly one space character.  This
parser also happens to result in exactly the space character.

The =<<or= combinator is called on any number of parsers as arguments
and returns a new parser.  The new parser will accept any of the
inputs that =<<or='s arguments accept.  So in the above, you get a
parser that accepts any one of the whitespace characters. It works by
trying to parse with each one of its arguments in order. When a parse
fails, the stream is rewound to where it was before the parse started,
and the next parser is tried. When the end of the list is reached
without a successful parse, the whole thing fails.

Finally the =<<*= combinator is named for the [[https://en.wikipedia.org/wiki/Kleene_star][Kleene star]].  It takes a
single parser as an argument and returns a parser that will,
effectively, accept the same input zero or more times, resulting in a
list of the results from the inner parser.

If the above definition is perhaps more verbose than you would like,
you could have instead used =<<any-char=, which takes a string as an
argument and returns a parser that accepts any character in the
string.

#+BEGIN_SRC lisp

(<<def <ws<
       (<<* (<<any-char (concatenate 'string '(#\Space #\Linefeed #\Return #\Tab)))))

#+END_SRC

** Mapping Results to true, false, and null

Before moving on to parsing numbers, it will be instructive to first
write parsers for the JSON values =true=, =false=, and =null=.

Here you will make use of the =<<string= and =<<map= combinators, both
of which are used frequently.

The =<<string= combinator creates a parser that will accept exactly
the string it was passed as its argument. Upon success, the defined
parser will result in that very same string.

An example should make this clear:

#+BEGIN_SRC lisp
PZ-JSON> (parse "hey dude" (<<string "hey") t)
"hey"
T
#<REPLAY-STREAMS:STATIC-TEXT-REPLAY-STREAM {1008A071C3}>
#+END_SRC

The parser =(<<string "hey")= accepted exactly the string =hey= from
the input =hey dude= and resulted in the string =hey=.

Notice that if you try to accept the string =dude= from the same
initial input, the parse will fail:

#+BEGIN_SRC lisp
PZ-JSON> (parse "hey dude" (<<string "dude") t)
NIL
NIL
#<REPLAY-STREAMS:STATIC-TEXT-REPLAY-STREAM {1008A42BA3}>
#+END_SRC

The parse resulted in failure (indicated by a second return value of
=NIL=) because, though /dude/ appeared in the input, it was not at
the beginning of the stream.

At this point it seems clear that you will will want to define parsers
that look something like this:

#+BEGIN_SRC lisp
(<<def <true< (<<string "true"))
(<<def <false< (<<string "false"))
(<<def <null< (<<string "null"))
#+END_SRC

However, while each of the above will accept the right inputs, they
all result in strings, which probably isn't what you want. That is,
=true= should probably result in =T=, =false= in =NIL=, and
=null= in.. hmm that's a tough one: perhaps a keyword called =:NULL=.

This is where =<<map= comes in.

The =<<map= combinator accepts two arguments: a function =F= and a
parser =P=.  If =P= would result in value =R=, then =(<<map F P)=
returns a parser that accepts the same inputs as =P= but results in
the value of =(funcall F R)=.

If the above word salad is just too bonkers to be of use, an example
should be much clearer:

#+BEGIN_SRC lisp
PZ-JSON> (parse "hey dude" (<<map #'string-upcase (<<string "hey")) t)
"HEY"
T
#<REPLAY-STREAMS:STATIC-TEXT-REPLAY-STREAM {1008C70623}>
PZ-JSON>
#+END_SRC

Ah! Much easier to understand. You just apply =#'string-upcase= to
the result of =(<<string "hey")=.

Writing the parsers for booleans and null values should now be easy:

#+BEGIN_SRC lisp

(<<def <true< (<<map (lambda (true) t) (<<string "true")))
(<<def <false< (<<map (lambda (false) nil) (<<string "false")))
(<<def <null< (<<map (lambda (null) :null) (<<string "null")))

#+END_SRC

Compiling the above and trying them out in the REPL you get, for example:

#+BEGIN_SRC lisp
; compilation unit finished
;   caught 1 STYLE-WARNING condition
; in: <<DEF <NULL<
;     (LAMBDA (NULL) :NULL)
; ==>
;   #'(LAMBDA (NULL) :NULL)
;
; caught STYLE-WARNING:
;   The variable NULL is defined but never used.
;
; compilation unit finished
;   caught 1 STYLE-WARNING condition
PZ-JSON> (parse "null" <null< t)
:NULL
T
#<REPLAY-STREAMS:STATIC-TEXT-REPLAY-STREAM {1008E204E3}>

#+END_SRC

Hmm everything works, but the compiler isn't happy. It is reporting a
warning that a variable is being defined but not used.  You could get
rid of this by doing something like, for example =(declare (ignore
null))=, for each of the above parser definitions, but it isn't
necessary: =parzival= supplies a mapping variant called =<<map-to=.
If you re-define the above parsers with =<<map-to=, the compiler warnings will go away:

#+BEGIN_SRC lisp
(<<def <true< (<<map-to t (<<string "true")))
(<<def <false< (<<map-to nil (<<string "false")))
(<<def <null< (<<map-to :null (<<string "null")))
#+END_SRC

=<<map-to= is convenient when you don't care about *what* was accepted
from the input, just that a parser did indeed succeed.  You can return
a literal value upon success.

** The Fundamental <<bind, Parsing Numbers

   Luckily, =parzival= includes two parsers that will get you most of
   the way to parsing JSON numbers. They are =<int<= and =<real<=,
   which parse integers and floating point numbers respectively. What
   =<real<= does not do, however, is parse exponential components of
   number strings.  I.e. It will correctly accept "-22.34" but not
   "-22.34E+33".

   To get the rest of the way, you will need to make use of three new
   combinators: =<<bind=, =<<?=, and =<<and=.

   First, =<<and= is analogous to Lisp's =and=, but works on parsers
   instead of values.  I.e. =(<<and <p1< <p2< ... <pn<)= will fail if
   any of its arguments fail, or will succeed if they all succeed,
   resulting in the result of its last argument, =<pn<=.

   Next, =<<?= is a combinator that makes an optional version of a
   parser. That is, a parser that will always succeed, even if it
   accepts no input.

   For example, in

   #+BEGIN_SRC lisp

PZ-JSON> (parse "abcd" (<<? (<<string "ab")) t)
"ab"
T
#<REPLAY-STREAMS:STATIC-TEXT-REPLAY-STREAM {1009078143}>
PZ-JSON> (parse "abcd" (<<? (<<string "XXXab")) t)
NIL
T
#<REPLAY-STREAMS:STATIC-TEXT-REPLAY-STREAM {1009079863}>
PZ-JSON>

   #+END_SRC

   Both parses succeed, indicated by the =T= as the second return
   value, but the second parse would have failed if it were not made
   optional using =<<?=.  An optional parser will *rewind the stream*,
   leaving it the way it was before the parse was attempted.  You will
   see further examples of stream rewinding parsers below.

   Finally, =<<bind= is probably the most fundamental combinator in
   =parzival=. With =<<bind=, you can combine parsers together, making
   use of intermediate results to make decisions mid-parse about how
   to parse forthcoming input. Here is an illustrative example:

#+BEGIN_SRC lisp

PZ-JSON> (<<def <bind-test<
           (let ((vars '(#\a 10 #\b 20 #\c 30)))                ; the parser closes over vars
             (<<bind (<<and (<<char #\?)  <item<)               ; <item< accepts any character
                     (lambda (var)                              ; the result is bound to var
                       (let ((val (getf vars  var)))
                         (if val                                ; either return a new parser
                             (<<map (lambda (num) (* val num))  ; that results in a number
                                    (<<and <whitespace< <int<))
                             <fail<))))))                       ; or fail

#<CLOSURE (LAMBDA (STREAM) :IN <<BIND) {1009E390DB}>
PZ-JSON> (parse "?a 7" <bind-test< t)
70
T
#<REPLAY-STREAMS:STATIC-TEXT-REPLAY-STREAM {1009E3A673}>
PZ-JSON> (parse "?z 7" <bind-test< t)
NIL
NIL
#<REPLAY-STREAMS:STATIC-TEXT-REPLAY-STREAM {1009E3C3E3}>
PZ-JSON>

#+END_SRC

   What is going on here?  The above example, while illustrative, is
   perhaps a bit hard to look at.  Stay strong - relief will soon be
   found when =<<let= is introduced in another section! For now,
   concentrate on =<<bind=.

   The syntax for =<<bind= looks like this:

   =(<<bind PARSER FUNCTION)=

  Where =FUNCTION= is a function of one argument that is expected to
  return a parser.

  So in the above, the parser you are binding is

  =(<<and (<<char #\?) <item<)=

  which parses any two character sequence that starts with a question
  mark, resulting in whatever character followed the question mark in
  the input.

  The result of the above becomes bound to the =var= argument in the
  =lambda= appearing as the second argument to =<<bind=. This function
  looks up the value of =var= in the plist called =vars=.  If the
  value is found, a new parser is returned that accepts whitespace
  followed by an integer, it then results in multiplying the looked up
  value by the parsed integer.  If no value in =vars= corresponded to
  =var=, then the function returns the =<fail<= parser, which fails on
  all input.

  You could perhaps clarify the above definition with some
  intermediate parsers:

#+BEGIN_SRC

(<<def <bind-test<
       (let* ((vars      '(#\a 10 #\b 20 #\c 30))
              (<var<     (<<and (<<char #\?) <item<))
              (<sep-int< (<<and <whitespace< <int<))
              (transform (lambda (var)
                           (if (getf vars var)
                               (<<map (lambda (num) (* num (getf vars var)))
                                      <sep-int<)
                               <fail<))))

         (<<bind <var< transform)))

#+END_SRC

Enough palaver. Time for you to define your number parser. Looking
back at the diagram in the JSON definition document, you see that
numbers are made up of up to four parts: a sign, a whole part, a
fractional part, and an exponent part.  For the first three parts you
are in luck because =parzival= provides =<real<=.  So you need only
concentrate on the exponential part.  That is a good place to start.

The exponential part is a case insensitive character =#\e= followed by
a an optional sign symbol and then an integer.

#+BEGIN_SRC lisp

(<<def <number-exp-part<
       (<<and (<<any-char "eE")    ; case insensitive #\e
              (<<? (<<char #\+))   ; optional + sign
              <int<))              ; an integer

#+END_SRC

You may be wondering why you only need to make the =#\+= character
optional, and not include a parser for the the =#\-= sign too.  The
reason is pretty unexciting: the =<int<= parser already
optionally accepts a minus sign because it parses negative integers as
well as positives.

Next, you use =<<bind= to decide whether or not to scale the order of
magnitude of an already parsed real number:

#+BEGIN_SRC lisp
(<<def <number<
       (<<bind <real<
               (lambda (real)
                 (<<map (lambda (exp?)
                          (if exp? (* real (expt 10 exp?))
                              real))
                        (<<? <number-exp-part<)))))
#+END_SRC

The above parser does the following:

1. Parses a real number with =<real<= 
2. binds a successful result to the variable =real= inside the =lambda= expression.
3. Optionally parses an exponential part using =(<<? <number-exp-part<)=
4. binds the result of =<number-exp-part<= to the variable =exp?=,
   which, if non-null, results in =real= scaled by =(expt 10
   exp?)=.

You can now test it out in the REPL:

#+BEGIN_SRC lisp

PZ-JSON> (parse "-234.443e-4" <number< t)

-0.023444299
T
#<REPLAY-STREAMS:STATIC-TEXT-REPLAY-STREAM {1007D378F3}>
PZ-JSON> (parse "-234.443e4" <number< t)
-2344430.0
T
#<REPLAY-STREAMS:STATIC-TEXT-REPLAY-STREAM {1007D3A4B3}>
PZ-JSON> (parse "4.443E+3" <number< t)
4443.0
T
#<REPLAY-STREAMS:STATIC-TEXT-REPLAY-STREAM {1007E14943}>
PZ-JSON> (parse "0.443E+3" <number< t)
443.0
T
#<REPLAY-STREAMS:STATIC-TEXT-REPLAY-STREAM {1007E170D3}>
PZ-JSON> (parse "00001.443E+3" <number< t)
1443.0
T
#<REPLAY-STREAMS:STATIC-TEXT-REPLAY-STREAM {1007E29873}>
PZ-JSON>

#+END_SRC

In the very last REPL example, you see that =<number<= is actually
slightly wrong! The JSON definition only permits an initial =0= if the
number has no whole part.  That is, a correctly implemented =<number<=
should reject the string "00001.443E+3".  I'll leave that as an
exercise to the reader ;) .

A short note. =<<let= is a stunningly convenient macro that uses
=<<bind= under the hood. Here is the above =<number<= parser defined
using =<<let=.

#+BEGIN_SRC lisp

(<<def <number<
       (<<let ((real <real<)
               (exp? (<<? <number-exp-part<)))
          (<<result (* real (if exp? (expt 10 exp?) 1)))))

#+END_SRC

=<<let= defines a parser by binding intermediate results to variables
and then letting you make use of those bindings in an expression that
returns a new parser.

The =<<result= parser in the above accepts no input and results in its
argument. E.g. =(<<result 10)= would succeed, having accepted no
input, resulting in the number 10. It is handy inside =<<let= bodies,
but but is used in many surprising places.

** Bracketed Sequences with Strings

With strings, things start to get whacky.  The basic structure of a
JSON string is that of a sequence of zero or more characters
surrounded by quotation marks. Included in =parzival= are two
combinators called =<<brackets= and =<<char-brackets=.  Both are for
dealing with demarcated input. I.e. When you want to get =TARGET= out
of something that looks like =LEFT TARGET RIGHT=, then you use a
bracket combinator.

Getting hypothetical for a moment, you can already tell that the string
parser will look something like:

#+BEGIN_SRC lisp
;; incomplete sketch
(<<char-brackets #\" (<<* <string-char<) \")
#+END_SRC

I.e. a sequence of zero or more valid characters, bracketed by
quotation marks.

The above is close, but it isn't quite right. The =<<*= combinator
results in a *list* of matched values, but what you actually want is a
*string*.  Hence, your old friend =<<map=:

#+BEGIN_SRC lisp
(<<def <string<
       (<<map (lambda (accepted) (apply #'concatenate 'string accepted))
              (<<char-brackets  #\"  (<<* <string-char<)  #\")))
#+END_SRC

Now things get hairy. The definition of =<string-char<= is slightly
more complicated than you might think it should be because of *escape
sequences*: some characters in a valid JSON string are denoted by a
sequence that looks like =BACKSLASH CHARACTER=, and others by a
sequence like =BACKSLASH U HEX HEX HEX HEX=.

Feel free to study the definition of =<string<= in detail on your
own. The only new combinators it uses are =<<plus=, =<<times=, and
=<<sat=.

The =<<plus= combinator is a two argument version of =<<or=. Actually
=<<or= is defined in terms of =<<plus=.

The =<<times= combinator takes a number =N= and a parser =P= and results
in a list of exactly =N= results =P=.  E.g.

#+BEGIN_SRC lisp
PZ-JSON> (parse "aaba" (<<times 2 (<<char #\a)) t)
(#\a #\a)
T
#<REPLAY-STREAMS:STATIC-TEXT-REPLAY-STREAM {1005090833}>
PZ-JSON> (parse "aaba" (<<times 3 (<<char #\a)) t)
NIL
NIL
#<REPLAY-STREAMS:STATIC-TEXT-REPLAY-STREAM {10051025A3}>
#+END_SRC

And the =<<sat= combinator accepts a single character, subject to a
predicate. If the predicate returns =NIL=, the parser fails.

So here is the code defining the =<string<= parser:

#+BEGIN_SRC lisp

   (defun codes-to-char (codes)
     "Accepts a list of characters, each one representing a hex
     digit. Returns a list containing a single character represented by
     those digits"
     (list (code-char (read-from-string
                       (concatenate 'string "#x" codes)))))

   ;; parses a single escaped sequence, either a slash and an escape
   ;; code, or a slash and four hex digits.  results in a list that
   ;; contains one character
   (<<def <escaped-char<
          (let* ((escapes '(#\b #\Backspace   ; a lookup table for character replacement
                            #\f #\Formfeed
                            #\n #\Linefeed
                            #\r #\Return
                            #\t #\Tab
                            #\" #\"
                            #\\ #\\
                            #\/ #\/)))
            (<<and (<<char #\\)              ; escapes start with a \
                    (<<or (<<bind <item<
                                  (lambda (c) (if (getf escapes c)
                                                  (<<result (list (getf escapes c))) ;; need a list
                                                  <fail<)))
                          (<<and (<<char #\u)
                                 (<<map #'codes-to-char
                                        (<<times 4 (<<any-char "0123456789aAbBcCdDeEfF"))))))))

  ;; a string-char is either an escaped char or any char that is neither
  ;; a quote nor a slash
   (<<def <string-char<
          (<<plus <escaped-char<
                  (<<map #'list (<<sat (lambda (c) (not (member c '(#\" #\\))))))))


   (<<def <string<
          (<<map (lambda (accepted) (apply #'concatenate 'string accepted))
                 (<<char-brackets
                  #\"
                  (<<* <string-char<)
                  #\")))


#+END_SRC

And here you can see it in action. Its a little cumbersome to test in
the REPL because you have to escape both the quotes and the the escapes:

#+BEGIN_SRC lisp
PZ-JSON> (parse "\"ab\\u6211cd moo \\n\"" <string< t)
"ab我cd moo
"
T
#<REPLAY-STREAMS:STATIC-TEXT-REPLAY-STREAM {100530E183}>
PZ-JSON> (parse "\"ab\\u0123Fcd\"" <string< t)
"abģFcd"
T
#<REPLAY-STREAMS:STATIC-TEXT-REPLAY-STREAM {1005340AA3}>
PZ-JSON> (parse "\"they call me Colin \\\"Parse Master\\\" Okay\"" <string< t)
"they call me Colin \"Parse Master\" Okay"
T
#<REPLAY-STREAMS:STATIC-TEXT-REPLAY-STREAM {10055BDF23}>
PZ-JSON>
#+END_SRC


** Recursive Parsers

You're in the home stretch! You've defined parsers for all of the
primitive value types, and now only the complex types remain.  And
here is where you encounter a new and interesting challenge.

Looking at the JSON definition, you notice two things.

1) =value=, representing any valid JSON value, is define din terms of
   =object= and  =array=.
2) But =object= and =array= are both defined in terms of =value=.

That's right! It's time for recursive parser definitions.

So, without having defined =<object<= or =<array<=, you can still go
ahead and define =<value<=.

#+BEGIN_SRC lisp
  ;; not strictly necessary, define these to keep the compiler from
  ;; complaining, and so that you can test things out in the REPL as you
  ;; go.
  (<<def <array< <fail<)
  (<<def <object< <fail<)

  (<<def <value<
         (<<or <object< <array< <string< <number< <true< <false< <null<))

#+END_SRC

Now the task is to define =<array<=. An array is just a bracketed list
of zero or more values, separated by commas and whitespace.  You
already know about brackets, and =parzival= provides combinators
called =<<strip=, for stripping whitespace, and =<<sep-by*= for the
rest.  Here's how it looks:

#+BEGIN_SRC lisp

;; results in a list
(<<def <array<
       (<<char-brackets  #\[
                         (<<sep-by* (<<strip  <value<) (<<char #\,))
                         #\]))

#+END_SRC

And finally, =<object<=.  An object is a sequence of zero or more
=STRING : VALUE= pairs, separated by commas and whitespace, and
bracketed by curly braces.  Again, pretty straightforward:

#+BEGIN_SRC lisp
(<<def <object-pair<
       (<<let ((prop <string<)
               (value (<<and <ws<
                             (<<char #\:)
                             <ws<
                             <value<)))
              (<<result (cons prop value))))

;; results in an association list
(<<def <object<
       (<<char-brackets #\{
                        (<<sep-by* (<<strip <object-pair<) (<<char #\,))
                        #\}))

#+END_SRC

Glorious! Try it out, go wild! (And please, pester [[http://github.com/cbeo][cbeo]] with bugs.)

#+BEGIN_SRC lisp

PZ-JSON> (parse "{\"a\" : 10 , \"b\" : 3 }" <value< t)
(("a" . 10) ("b" . 3))
T
#<REPLAY-STREAMS:STATIC-TEXT-REPLAY-STREAM {100334FA63}>
PZ-JSON> (parse "{ \"name\" : \"colin\",
\"hobbies\" : [\"lisp\"  , \"parsing\"  ]   ,
\"features\" : { \"head\" : \"round\", \"eyes\" : 2} }" <value< t)
(("name" . "colin") ("hobbies" "lisp" "parsing")
 ("features" ("head" . "round") ("eyes" . 2)))
T
#<REPLAY-STREAMS:STATIC-TEXT-REPLAY-STREAM {1003380733}>
PZ-JSON>

#+END_SRC

*** Parsing JSON Files

 Here is how you would parse some JSON from a file:

 #+BEGIN_SRC lisp

 PZ-JSON> (with-open-file (file-input "examples/foo.json")
            (let ((rp-stream (make-instance 'replay-streams:character-input-replay-stream
                                            :source file-input)))
              (parse rp-stream <value<)))
 ((("name" . "Boutade")
   ("languages"
    (("lang" . "Common Lisp") ("proficiency" . :NULL) ("lovesIt" . T))
    (("lang" . "Rust") ("proficiency" . 0.8) ("lovesIt" . T)
     ("isAshamedToLoveIt" . T))
    (("lang" . "Haskell") ("proficiency" . 0.5)
     ("lovesIt" . "sometimes, in some ways")))
   ("pizzaOrder" "Tempeh Italian Sausage" "Spinach" "Mushrooms"
    "Red Pepper Flakes")
   ("isCool") ("isFunny") ("thinksPeopleAreLaughing" . T)
   ("beHonest_thinksPeopleAreLaughing"))
  (("name" . "Goofist")
   ("languages"
    (("lang" . "Common Lisp") ("proficiency" "over" 9000) ("lovesIt" . T))
    (("lang" . "Rust") ("proficiency" . -1) ("lovesIt" . T)
     ("isAshamedToLoveIt" . T))
    (("lang" . "Haskell") ("proficiency" . -1)
     ("lovesIt" . "i cannot tell a lie")))
   ("pizzaOrder" "Blue Stilton" "Walnuts" "Pork Sausage" "Apple Slices")
   ("isCool" . T) ("isFunny" . T) ("thinksPeopleAreLaughing" . T)
   ("beHonest_thinksPeopleAreLaughing" . T)))
 T
 #<REPLAY-STREAMS:CHARACTER-INPUT-REPLAY-STREAM source-head: 1485, head: 1485>
 PZ-JSON>

 #+END_SRC

 For the moment, parsers only work on instances of [[https://github.com/cbeo/replay-streams][replay-streams]]. If
 you pass raw text to the =parse= function for its =STREAM= argument,
 then you must also pass a =T= into its third optional argument
 position. Otherwise the stream is assumed to be a =replay-stream=.

*** Problems to Puzzle Out

1. Association Lists may or may not be the most appropriate data
   structure for the representation of JSON objects. How could you
   change the =<object<= definition to make something more
   convenient. E.g. plists perhaps?

2. As noted above, the =<number<= parser actually parses some numbers
   that are not technically valid JSON values. Specifically, valid
   JSON numbers may start with at most one =0=.  How would you change
   =<number<= to correct for this?

3. Perhaps Lists are not the right structure for JS arrays.  Maybe you
   should change =<array<= to result in Common Lisp arrays?

** Conclusion

I hope you have had a good time learning about how this parser
combinator library works.  Go forth and parse!

signing off.
cbeo.

*** code listing

 For your convenience, the complete code listing follow

#+BEGIN_SRC lisp
(defpackage :parzival-json
   (:use :cl :parzival)
   (:nicknames :pz-json))

(in-package :parzival-json)

(<<def <ws<
       (<<* (<<any-char (concatenate 'string '(#\Space #\Linefeed #\Return #\Tab)))))

(<<def <true< (<<map-to t (<<string "true")))
(<<def <false< (<<map-to nil (<<string "false")))
(<<def <null< (<<map-to :null (<<string "null")))

(<<def <number-exp-part<
       (<<and (<<any-char "eE")
              (<<? (<<char #\+))
              <int<))


(<<def <number<
       (<<let ((real <real<)
               (exp? (<<? <number-exp-part<)))
              (<<result (* real (if exp? (expt 10 exp?) 1)))))


(defun codes-to-char (codes)
  "Accepts a list of characters, each one representing a hex
  digit. Returns a list containing a single character represented by
  those digits"
  (list (code-char (read-from-string
                    (concatenate 'string "#x" codes)))))

(<<def <escaped-char<
       (let* ((escapes '(#\b #\Backspace   ; a lookup table for character replacement
                         #\f #\Formfeed
                         #\n #\Linefeed
                         #\r #\Return
                         #\t #\Tab
                         #\" #\"
                         #\\ #\\
                         #\/ #\/)))
         (<<and (<<char #\\)              ; escapes start with a \
                 (<<or (<<bind <item<
                               (lambda (c) (if (getf escapes c)
                                               (<<result (list (getf escapes c))) ;; need a list
                                               <fail<)))
                       (<<and (<<char #\u)
                              (<<map #'codes-to-char
                                     (<<times 4 (<<any-char "0123456789aAbBcCdDeEfF"))))))))

(<<def <string-char<
       ;; either an escaped char or any char that is neither a quote nor an escape
       (<<plus <escaped-char<
               (<<map #'list (<<sat (lambda (c) (not (member c '(#\" #\\))))))))


(<<def <string<
       (<<map (lambda (accepted) (apply #'concatenate 'string accepted))
              (<<char-brackets
               #\"
               (<<* <string-char<)
               #\")))

(<<def <array< <fail<)
(<<def <object< <fail<)

(<<def <value<
       (<<or <object< <array< <string< <number< <true< <false< <null<))

(<<def <array<
       (<<char-brackets  #\[
                         (<<sep-by* (<<strip  <value<) (<<char #\,))
                         #\]))

(<<def <object-pair<
       (<<let ((prop <string<)
               (value (<<and <ws<
                             (<<char #\:)
                             <ws<
                             <value<)))
              (<<result (cons prop value))))

(<<def <object<
       (<<char-brackets #\{
                        (<<sep-by* (<<strip <object-pair<) (<<char #\,))
                        #\}))


#+END_SRC