34
35
36:- module(html_text,
37 [ html_text/1, 38 html_text/2 39 ]). 40:- autoload(library(ansi_term),[ansi_format/3]). 41:- autoload(library(apply),[foldl/4,maplist/3,maplist/2]). 42:- autoload(library(debug),[debug/3]). 43:- autoload(library(error),[must_be/2]). 44:- autoload(library(lists),
45 [ append/3, list_to_set/2, reverse/2, delete/3, sum_list/2,
46 nth1/3, max_list/2
47 ]). 48:- autoload(library(option),[select_option/4,merge_options/3,option/3]). 49:- autoload(library(sgml),[xml_is_dom/1,load_html/3]). 50:- autoload(library(lynx/format),[format_paragraph/2,trim_line/2]). 51:- autoload(library(lynx/html_style),
52 [ element_css/3, css_block_options/5, css_inline_options/3,
53 attrs_classes/2, style_css_attrs/2
54 ]). 55
69
70html_text(Input) :-
71 html_text(Input, []).
72
73html_text(Input, Options) :-
74 ( xml_is_dom(Input)
75 -> DOM = Input
76 ; load_html(Input, DOM, Options)
77 ),
78 default_state(State0),
79 state_options(Options, State0, State),
80 init_nl,
81 format_dom(DOM, State).
82
83state_options([], State, State).
84state_options([H|T], State0, State) :-
85 H =.. [Key,Value],
86 ( fmt_option(Key, Type, _Default)
87 -> must_be(Type, Value),
88 State1 = State0.put(Key,Value)
89 ; State1 = State0
90 ),
91 state_options(T, State1, State).
92
93fmt_option(margin_left, integer, 0).
94fmt_option(margin_right, integer, 0).
95fmt_option(text_align, oneof([justify, left]), justify).
96fmt_option(width, between(10,1000), 72).
97
98default_state(State) :-
99 findall(Key-Value, fmt_option(Key, _, Value), Pairs),
100 dict_pairs(Dict, _, Pairs),
101 State = Dict.put(_{ style:[], list:[]}).
102
106
107format_dom([], _) :-
108 !.
109format_dom([H|T], State) :-
110 format_dom(H, State),
111 !,
112 format_dom(T, State).
113format_dom(Content, State) :-
114 Content = [H0|_],
115 \+ is_block_element(H0),
116 !,
117 ( append(Inline, [H|T], Content),
118 is_block_element(H)
119 -> true
120 ; Inline = Content
121 ),
122 format_dom(element(p, [], Inline), State),
123 format_dom([H|T], State).
124format_dom(element(html, _, Content), State) :-
125 !,
126 format_dom(Content, State).
127format_dom(element(head, _, _), _) :-
128 !.
129format_dom(element(body, _, Content), State) :-
130 !,
131 format_dom(Content, State).
132format_dom(element(E, Attrs, Content), State) :-
133 !,
134 ( format_element(E, Attrs, Content, State)
135 -> true
136 ; debug(format(html), 'Skipped block element ~q', [E])
137 ).
138
139format_element(pre, Attrs, [Content], State) :-
140 !,
141 block_element(pre, Attrs, Top-Bottom, BlockAttrs, Style),
142 update_style(Style, State, State1),
143 ask_nl(Top),
144 emit_code(Content, BlockAttrs, State1),
145 ask_nl(Bottom).
146format_element(table, Attrs, Content, State) :-
147 !,
148 block_element(table, Attrs, Top-Bottom, BlockAttrs, Style),
149 update_style(Style, State, State1),
150 state_par_properties(State1, BlockAttrs, BlockOptions),
151 ask_nl(Top),
152 emit_nl,
153 format_table(Content, Attrs, BlockOptions, State1),
154 ask_nl(Bottom).
155format_element(hr, Attrs, _, State) :-
156 !,
157 block_element(hr, Attrs, Top-Bottom, BlockAttrs, Style),
158 update_style(Style, State, State1),
159 state_par_properties(State1, BlockAttrs, BlockOptions),
160 ask_nl(Top),
161 emit_nl,
162 emit_hr(Attrs, BlockOptions, State1),
163 ask_nl(Bottom).
164format_element(Elem, Attrs, Content, State) :-
165 block_element(Elem, Attrs, Top-Bottom, BlockAttrs, Style),
166 !,
167 update_style(Style, State, State1),
168 block_words(Content, SubBlocks, Words, State1),
169 ( Words == []
170 -> true
171 ; ask_nl(Top),
172 emit_block(Words, BlockAttrs, State1),
173 ask_nl(Bottom)
174 ),
175 ( SubBlocks \== []
176 -> update_state_par_properties(BlockAttrs, State1, State2),
177 format_dom(SubBlocks, State2)
178 ; true
179 ).
180format_element(Elem, Attrs, Content, State) :-
181 list_element(Elem, Attrs, Top-Bottom, State, State1),
182 !,
183 open_list(Elem, State1, State2),
184 ask_nl(Top),
185 format_list(Content, Elem, 1, State2),
186 ask_nl(Bottom).
187format_element(Elem, Attrs, Content, State) :-
188 format_list_element(element(Elem, Attrs, Content), none, 0, State).
189
193
194block_element(El, Attrs, Margins, ParOptions, Style) :-
195 block_element(El, Margins0, ParOptions0, Style0),
196 ( nonvar(Attrs),
197 element_css(El, Attrs, CSS)
198 -> css_block_options(CSS, Margins0, Margins, ParOptions, Style1),
199 append(Style1, Style0, Style2),
200 list_to_set(Style2, Style)
201 ; Margins = Margins0,
202 ParOptions = ParOptions0,
203 Style = Style0
204 ).
205
206block_element(p, 1-2, [], []).
207block_element(div, 1-1, [], []).
208block_element(hr, 1-1, [], []).
209block_element(h1, 2-2, [], [bold]).
210block_element(h2, 2-2, [], [bold]).
211block_element(h3, 2-2, [], [bold]).
212block_element(h4, 2-2, [], [bold]).
213block_element(pre, 2-2, [], []).
214block_element(blockquote, 2-2, [margin_left(4), margin_right(4)], []).
215block_element(table, 2-2, [], []).
216
217list_element(ul, _, Margins, State0, State) :-
218 margins(4, 4, State0, State),
219 list_level_margins(State, Margins).
220list_element(ol, _, Margins, State0, State) :-
221 margins(4, 4, State0, State),
222 list_level_margins(State, Margins).
223list_element(dl, _, 2-2, State, State).
224
225list_element(ul).
226list_element(ol).
227list_element(dl).
228
229list_level_margins(State, 2-2) :-
230 nonvar(State),
231 State.get(list) == [],
232 !.
233list_level_margins(_, 0-0).
234
235format_list([], _, _, _).
236format_list([H|T], Type, Nth, State) :-
237 format_list_element(H, Type, Nth, State),
238 ( T == []
239 -> true
240 ; Nth1 is Nth + 1,
241 format_list(T, Type, Nth1, State)
242 ).
243
244format_list_element(element(LE, Attrs, Content), Type, Nth, State) :-
245 setup_list_element(LE, Attrs, Type, Nth, ListParProps, State, State1),
246 block_words(Content, Blocks, Words, State1),
247 emit_block(Words, ListParProps, State1),
248 ( Blocks \== []
249 -> ask_nl(2), 250 update_state_par_properties(ListParProps, State1, State2),
251 format_dom(Blocks, State2)
252 ; true
253 ).
254
255setup_list_element(li, _Attrs, _Type, Nth, ListParProps, State, State) :-
256 list_par_properties(State.list, Nth, ListParProps).
257setup_list_element(dt, _Attrs, _Type, _Nth, [], State, State2) :-
258 margins(0, 0, State, State1),
259 update_style([bold], State1, State2).
260setup_list_element(dd, _Attrs, _Type, _Nth, [], State, State1) :-
261 margins(4, 0, State, State1).
262
263list_item_element(li).
264list_item_element(dt).
265list_item_element(dd).
266
267list_par_properties([ul|_More], _, [bullet('\u2022')]).
268list_par_properties([ol|_More], N, [bullet(N)]).
269
270
274
275block_words(Content, RC, Words, State) :-
276 phrase(bwords(Content, RC, State), Words0),
277 join_whitespace(Words0, Words1),
278 trim_line(Words1, Words).
279
280bwords([], [], _) -->
281 !.
282bwords([H|T], Rest, _State) -->
283 { var(Rest),
284 is_block_element(H),
285 !,
286 Rest = [H|T]
287 }.
288bwords([H|T], Rest, State) -->
289 !,
290 bwordsel(H, State),
291 bwords(T, Rest, State).
292
293is_block_element(element(E,_,_)) :-
294 ( block_element(E, _, _, _)
295 ; list_element(E)
296 ; list_item_element(E)
297 ),
298 debug(format(html), 'Found block ~q', [E]),
299 !.
300
301bwordsel(element(Elem, Attrs, Content), State) -->
302 { styled_inline(Elem, Attrs, Margins, Style),
303 !,
304 update_style(Style, State, State1)
305 },
306 left_margin(Margins),
307 bwords(Content, [], State1),
308 right_margin(Margins).
309bwordsel(element(br, _, _), _State) -->
310 [br([])].
311bwordsel(CDATA, State) -->
312 { atomic(CDATA),
313 !,
314 split_string(CDATA, " \n\t\r", "", Words)
315 },
316 words(Words, State).
317bwordsel(element(Elem, _Attrs, _Content), _State) -->
318 { debug(format(html), 'Skipped inline element ~q', [Elem]) }.
319
320left_margin(0-_) --> !.
321left_margin(N-_) --> [b(N,_)].
322
323right_margin(_-0) --> !.
324right_margin(_-N) --> [b(N,_)].
325
326styled_inline(El, Attrs, Margins, Style) :-
327 styled_inline(El, Style0),
328 ( nonvar(Attrs),
329 element_css(El, Attrs, CSS)
330 -> css_inline_options(CSS, Margins, Style1),
331 append(Style1, Style0, Style2),
332 list_to_set(Style2, Style)
333 ; Style = Style0
334 ).
335
336styled_inline(b, [bold]).
337styled_inline(strong, [bold]).
338styled_inline(em, [bold]).
339styled_inline(span, []).
340styled_inline(i, [underline]).
341styled_inline(a, [underline]).
342styled_inline(var, []).
343styled_inline(code, []).
344
349
350words([], _) --> [].
351words([""|T0], State) -->
352 !,
353 { skip_leading_spaces(T0, T) },
354 space,
355 words(T, State).
356words([H|T], State) -->
357 word(H, State),
358 ( {T==[]}
359 -> []
360 ; { skip_leading_spaces(T, T1) },
361 space,
362 words(T1, State)
363 ).
364
365skip_leading_spaces([""|T0], T) :-
366 !,
367 skip_leading_spaces(T0, T).
368skip_leading_spaces(L, L).
369
370word(W, State) -->
371 { string_length(W, Len),
372 ( Style = State.get(style)
373 -> true
374 ; Style = []
375 )
376 },
377 [w(W, Len, Style)].
378
379space -->
380 [b(1,_)].
381
385
386join_whitespace([], []).
387join_whitespace([H0|T0], [H|T]) :-
388 join_whitespace(H0, H, T0, T1),
389 !,
390 join_whitespace(T1, T).
391join_whitespace([H|T0], [H|T]) :-
392 join_whitespace(T0, T).
393
394join_whitespace(b(Len0,_), b(Len,_), T0, T) :-
395 take_whitespace(T0, T, Len0, Len).
396
397take_whitespace([b(Len1,_)|T0], T, Len0, Len) :-
398 !,
399 Len2 is max(Len1,Len0),
400 take_whitespace(T0, T, Len2, Len).
401take_whitespace(L, L, Len, Len).
402
403
404 407
411
412update_style([], State, State) :-
413 !.
414update_style(Extra, State0, State) :-
415 ( get_dict(style, State0, Style0, State, Style)
416 -> add_style(Extra, Style0, Style)
417 ; add_style(Extra, [], Style),
418 put_dict(style, State0, Style, State)
419 ).
420
421add_style(Extra, Style0, Style) :-
422 reverse(Extra, RevExtra),
423 foldl(add1_style, RevExtra, Style0, Style).
424
428
429add1_style(New, Style0, Style) :-
430 ( style_overrides(New, Add, Overrides)
431 -> delete_all(Overrides, Style0, Style1),
432 append(Add, Style1, Style)
433 ; Style = [New|Style0]
434 ).
435
436delete_all([], List, List).
437delete_all([H|T], List0, List) :-
438 delete(List0, H, List1),
439 delete_all(T, List1, List).
440
441style_overrides(normal, [], [bold]).
442style_overrides(fg(C), [fg(C)], [fg(_), hfg(_)]).
443style_overrides(bg(C), [bg(C)], [bg(_), hbg(_)]).
444style_overrides(underline(false), [], [underline]).
445
446margins(Left, Right, State0, State) :-
447 _{ margin_left:ML0, margin_right:MR0 } >:< State0,
448 ML is ML0 + Left,
449 MR is MR0 + Right,
450 State = State0.put(_{margin_left:ML, margin_right:MR}).
451
452open_list(Type, State0, State) :-
453 get_dict(list, State0, Lists, State, [Type|Lists]).
454
455update_state_par_properties([], State, State).
456update_state_par_properties([H|T], State0, State) :-
457 H =.. [ Key, Value ],
458 State1 = State0.put(Key,Value),
459 update_state_par_properties(T, State1, State).
460
465
466state_par_properties(State, Props) :-
467 Props0 = [ margin_left(LM),
468 margin_right(RM),
469 text_align(TA),
470 width(W),
471 pad(Pad)
472 ],
473 _{margin_left:LM, margin_right:RM, text_align:TA, width:W,
474 pad:Pad} >:< State,
475 filled_par_props(Props0, Props).
476
477filled_par_props([], []).
478filled_par_props([H|T0], [H|T]) :-
479 arg(1, H, A),
480 nonvar(A),
481 !,
482 filled_par_props(T0, T).
483filled_par_props([_|T0], T) :-
484 filled_par_props(T0, T).
485
486
487state_par_properties(State, Options, BlockOptions) :-
488 state_par_properties(State, Options0),
489 foldl(merge_par_option, Options, Options0, BlockOptions).
490
491merge_par_option(margin_left(ML0), Options0, [margin_left(ML)|Options1]) :-
492 !,
493 select_option(margin_left(ML1), Options0, Options1, 0),
494 ML is ML0+ML1.
495merge_par_option(margin_right(MR0), Options0, [margin_right(MR)|Options1]) :-
496 !,
497 select_option(margin_right(MR1), Options0, Options1, 0),
498 MR is MR0+MR1.
499merge_par_option(Opt, Options0, Options) :-
500 merge_options([Opt], Options0, Options).
501
507
508emit_block([], _, _) :-
509 !.
510emit_block(Words, Options, State) :-
511 state_par_properties(State, Options, BlockOptions),
512 ask_nl(1),
513 emit_nl,
514 format_paragraph(Words, BlockOptions),
515 ask_nl(1).
516
522
523init_nl :-
524 nb_setval(nl_pending, start).
525
526init_nl(Old) :-
527 ( nb_current(nl_pending, Old)
528 -> true
529 ; Old = []
530 ),
531 nb_setval(nl_pending, start).
532exit_nl(Old) :-
533 nb_setval(nl_pending, Old).
534
535ask_nl(N) :-
536 ( nb_current(nl_pending, N0)
537 -> ( N0 == start
538 -> true
539 ; integer(N0)
540 -> N1 is max(N0, N),
541 nb_setval(nl_pending, N1)
542 ; nb_setval(nl_pending, N)
543 )
544 ; nb_setval(nl_pending, N)
545 ).
546
547emit_nl :-
548 ( nb_current(nl_pending, N),
549 integer(N)
550 -> forall(between(1,N,_), nl)
551 ; true
552 ),
553 nb_setval(nl_pending, 0).
554
555
556 559
561
562emit_code(Content, BlockAttrs, State) :-
563 Style = State.style,
564 split_string(Content, "\n", "", Lines),
565 option(margin_left(LM0), BlockAttrs, 4),
566 LM is LM0+State.margin_left,
567 ask_nl(1),
568 emit_nl,
569 emit_code_lines(Lines, 1, LM, Style),
570 ask_nl(1).
571
572emit_code_lines([], _, _, _).
573emit_code_lines([H|T], LineNo, LM, Style) :-
574 emit_code_line(H, LineNo, LM, Style),
575 LineNo1 is LineNo + 1,
576 emit_code_lines(T, LineNo1, LM, Style).
577
578emit_code_line(Line, _LineNo, LM, Style) :-
579 emit_nl,
580 emit_indent(LM),
581 ( Style == []
582 -> write(Line)
583 ; ansi_format(Style, '~s', [Line])
584 ),
585 ask_nl(1).
586
587emit_indent(N) :-
588 forall(between(1, N, _),
589 put_char(' ')).
590
591
592 595
597
598format_table(Content, Attrs, BlockAttrs, State) :-
599 tty_state(TTY),
600 option(margin_left(ML), BlockAttrs, 0),
601 option(margin_right(MR), BlockAttrs, 0),
602 MaxTableWidth is State.width - ML - MR,
603 table_cell_state(Attrs, State, CellState),
604 phrase(rows(Content), Rows),
605 columns(Rows, Columns),
606 maplist(auto_column_width(CellState.put(tty,false)), Columns, Widths),
607 column_widths(Widths, MaxTableWidth, ColWidths),
608 maplist(format_row(ColWidths, CellState.put(tty,TTY), ML), Rows).
609
610tty_state(TTY) :-
611 stream_property(current_output, tty(true)),
612 !,
613 TTY = true.
614tty_state(false).
615
616
621
622column_widths(Widths, MaxTableWidth, Widths) :-
623 sum_list(Widths, AutoWidth),
624 AutoWidth =< MaxTableWidth,
625 !.
626column_widths(AutoWidths, MaxTableWidth, Widths) :-
627 sort(0, >=, AutoWidths, Sorted),
628 append(Wrapped, Keep, Sorted),
629 sum_list(Keep, KeepWidth),
630 KeepWidth < MaxTableWidth/2,
631 length(Wrapped, NWrapped),
632 WideWidth is round((MaxTableWidth-KeepWidth)/NWrapped),
633 ( [KeepW|_] = Keep
634 -> true
635 ; KeepW = 0
636 ),
637 !,
638 maplist(truncate_column(KeepW,WideWidth), AutoWidths, Widths).
639
640truncate_column(Keep, WideWidth, AutoWidth, Width) :-
641 ( AutoWidth =< Keep
642 -> Width = AutoWidth
643 ; Width = WideWidth
644 ).
645
646table_cell_state(Attrs, State, CellState) :-
647 ( element_css(table, Attrs, CSS)
648 -> true
649 ; CSS = []
650 ),
651 option(padding_left(PL), CSS, 1),
652 option(padding_right(PR), CSS, 1),
653 CellState = State.put(_{margin_left:PL, margin_right:PR}).
654
655
657
658rows([]) --> [].
659rows([H|T]) --> rows(H), rows(T).
660rows([element(tbody,_,Content)|T]) --> rows(Content), rows(T).
661rows([element(tr,Attrs,Columns)|T]) --> [row(Columns, Attrs)], rows(T).
662
667
668columns(Rows, Columns) :-
669 columns(Rows, 1, Columns).
670
671columns(Rows, I, Columns) :-
672 maplist(row_column(I, Found), Rows, H),
673 ( Found == true
674 -> Columns = [H|T],
675 I2 is I + 1,
676 columns(Rows, I2, T)
677 ; Columns = []
678 ).
679
680row_column(I, Found, row(Columns, _Attrs), Cell) :-
681 ( nth1(I, Columns, Cell)
682 -> Found = true
683 ; Cell = element(td,[],[])
684 ).
685
686auto_column_width(State, Col, Width) :-
687 maplist(auto_cell_width(State), Col, Widths),
688 max_list(Widths, Width).
689
690auto_cell_width(State, Cell, Width) :-
691 cell_colspan(Cell, 1),
692 !,
693 format_cell_to_string(Cell, 1_000, State, String),
694 split_string(String, "\n", "", Lines),
695 maplist(string_length, Lines, LineW),
696 max_list(LineW, Width0),
697 Width is Width0 + State.margin_right.
698auto_cell_width(_, _, 0).
699
703
704format_row(ColWidths, State, MarginLeft, Row) :-
705 hrule(Row, ColWidths, MarginLeft),
706 format_cells(ColWidths, CWSpanned, 1, Row, State, Cells),
707 format_row_lines(1, CWSpanned, Cells, MarginLeft).
708
709hrule(row(_, Attrs), ColWidths, MarginLeft) :-
710 attrs_classes(Attrs, Classes),
711 memberchk(hline, Classes),
712 !,
713 sum_list(ColWidths, RuleLen),
714 format('~N~t~*|~`-t~*+', [MarginLeft, RuleLen]).
715hrule(_, _, _).
716
717format_row_lines(LineNo, Widths, Cells, MarginLeft) :-
718 nth_row_line(Widths, 1, LineNo, Cells, CellLines, Found),
719 ( Found == true
720 -> emit_nl,
721 emit_indent(MarginLeft),
722 maplist(emit_cell_line, CellLines),
723 ask_nl(1),
724 LineNo1 is LineNo + 1,
725 format_row_lines(LineNo1, Widths, Cells, MarginLeft)
726 ; true
727 ).
728
729emit_cell_line(Line-Pad) :-
730 write(Line),
731 forall(between(1,Pad,_), put_char(' ')).
732
733nth_row_line([], _, _, _, [], _).
734nth_row_line([ColW|CWT], CellNo, LineNo, Cells, [CellLine-Pad|ColLines],
735 Found) :-
736 nth1(CellNo, Cells, CellLines),
737 ( nth1(LineNo, CellLines, CellLine)
738 -> Found = true,
739 Pad = 0
740 ; CellLine = '', Pad = ColW
741 ),
742 CellNo1 is CellNo + 1,
743 nth_row_line(CWT, CellNo1, LineNo, Cells, ColLines, Found).
744
745
751
752format_cells([], [], _, _, _, []) :- !.
753format_cells(CWidths, [HW|TW], Column, Row, State, [HC|TC]) :-
754 Row = row(Columns, _Attrs),
755 nth1(Column, Columns, Cell),
756 cell_colspan(Cell, CWidths, HW, TW0),
757 cell_align(Cell, Align),
758 format_cell_to_string(Cell, HW, State.put(_{pad:' ', text_align:Align}), String),
759 split_string(String, "\n", "", HC),
760 Column1 is Column+1,
761 format_cells(TW0, TW, Column1, Row, State, TC).
762
763cell_colspan(Cell, CWidths, HW, TW) :-
764 cell_colspan(Cell, Span),
765 length(SpanW, Span),
766 append(SpanW, TW, CWidths),
767 sum_list(SpanW, HW).
768
769cell_colspan(element(_,Attrs,_), Span) :-
770 ( memberchk(colspan=SpanA, Attrs),
771 atom_number(SpanA, SpanN)
772 -> Span = SpanN
773 ; Span = 1
774 ).
775
781
782cell_align(element(_,Attrs,_), Align) :-
783 ( memberchk(align=AlignA, Attrs)
784 -> Align = AlignA
785 ; memberchk(style=Style, Attrs),
786 style_css_attrs(Style, Props),
787 memberchk('text-align'(AlignA), Props)
788 -> Align = AlignA
789 ; Align = left
790 ).
791
792
796
797format_cell_to_string(element(_,_,[]), ColWidth, State, String) :-
798 Pad = State.get(pad),
799 !,
800 length(Chars, ColWidth),
801 maplist(=(Pad), Chars),
802 atomics_to_string(Chars, String).
803format_cell_to_string(Cell, ColWidth, State, String) :-
804 setup_call_cleanup(
805 init_nl(NlState),
806 with_output_to(
807 string(String),
808 format_cell(Cell, ColWidth, State)),
809 exit_nl(NlState)).
810
811format_cell(element(E, _Attrs, Content), ColWidth, State) :-
812 set_stream(current_output, tty(State.tty)),
813 cell_element(E, Style),
814 update_style(Style, State.put(width, ColWidth), CellState),
815 block_words(Content, Blocks, Words, CellState),
816 emit_block(Words, [], CellState),
817 ( Blocks \== []
818 -> format_dom(Blocks, CellState)
819 ; true
820 ).
821
822cell_element(td, [normal]).
823cell_element(th, [bold]).
824
825
829
830emit_hr(_Attrs, BlockAttrs, State) :-
831 option(margin_left(ML), BlockAttrs, 0),
832 option(margin_right(MR), BlockAttrs, 0),
833 RuleWidth is State.width - ML - MR,
834 Style = State.style,
835 emit_indent(ML),
836 ( Style == []
837 -> format('~|~*t~*+', [0'-, RuleWidth])
838 ; ansi_format(Style, '~|~*t~*+', [0'-, RuleWidth])
839 )