| import collections | |
| def get_main_text_font(pdf_docs): | |
| font_names = collections.Counter() | |
| for page in pdf_docs: | |
| blocks = page.get_text('dict')['blocks'] | |
| if blocks is not None: | |
| for block in blocks: | |
| lines = block.get('lines') | |
| if lines is not None: | |
| for line in lines: | |
| span_font = [(span['font'], len(span['text'])) for span in line['spans'] if | |
| 'font' in span and len(span['text']) > 0] | |
| if span_font: | |
| # main_text_font应该用基于字数最多的字体而不是span级别的统计 | |
| # font_names.append(font_name for font_name in span_font) | |
| # block_fonts.append(font_name for font_name in span_font) | |
| for font, count in span_font: | |
| font_names[font] += count | |
| main_text_font = font_names.most_common(1)[0][0] | |
| return main_text_font | |