mirror of
https://github.com/jart/cosmopolitan.git
synced 2025-02-15 10:47:56 +00:00
This change gets the Python codebase into a state where it conforms to the conventions of this codebase. It's now possible to include headers from Python, without worrying about ordering. Python has traditionally solved that problem by "diamonding" everything in Python.h, but that's problematic since it means any change to any Python header invalidates all the build artifacts. Lastly it makes tooling not work. Since it is hard to explain to Emacs when I press C-c C-h to add an import line it shouldn't add the header that actually defines the symbol, and instead follow the nonstandard Python convention. Progress has been made on letting Python load source code from the zip executable structure via the standard C library APIs. System calls now recognize zip!FILENAME alternative URIs as equivalent to zip:FILENAME since Python uses colon as its delimiter. Some progress has been made on embedding the notice license terms into the Python object code. This is easier said than done since Python has an extremely complicated ownership story. - Some termios APIs have been added - Implement rewinddir() dirstream API - GetCpuCount() API added to Cosmopolitan Libc - More bugs in Cosmopolitan Libc have been fixed - zipobj.com now has flags for mangling the path - Fixed bug a priori with sendfile() on certain BSDs - Polyfill F_DUPFD and F_DUPFD_CLOEXEC across platforms - FIOCLEX / FIONCLEX now polyfilled for fast O_CLOEXEC changes - APE now supports a hybrid solution to no-self-modify for builds - Many BSD-only magnums added, e.g. O_SEARCH, O_SHLOCK, SF_NODISKIO
257 lines
9.5 KiB
Python
257 lines
9.5 KiB
Python
# Copyright 2004-2005 Elemental Security, Inc. All Rights Reserved.
|
|
# Licensed to PSF under a Contributor Agreement.
|
|
|
|
"""Convert graminit.[ch] spit out by pgen to Python code.
|
|
|
|
Pgen is the Python parser generator. It is useful to quickly create a
|
|
parser from a grammar file in Python's grammar notation. But I don't
|
|
want my parsers to be written in C (yet), so I'm translating the
|
|
parsing tables to Python data structures and writing a Python parse
|
|
engine.
|
|
|
|
Note that the token numbers are constants determined by the standard
|
|
Python tokenizer. The standard token module defines these numbers and
|
|
their names (the names are not used much). The token numbers are
|
|
hardcoded into the Python tokenizer and into pgen. A Python
|
|
implementation of the Python tokenizer is also available, in the
|
|
standard tokenize module.
|
|
|
|
On the other hand, symbol numbers (representing the grammar's
|
|
non-terminals) are assigned by pgen based on the actual grammar
|
|
input.
|
|
|
|
Note: this module is pretty much obsolete; the pgen module generates
|
|
equivalent grammar tables directly from the Grammar.txt input file
|
|
without having to invoke the Python pgen C program.
|
|
|
|
"""
|
|
|
|
# Python imports
|
|
import re
|
|
|
|
# Local imports
|
|
from pgen2 import grammar, token
|
|
|
|
|
|
class Converter(grammar.Grammar):
    """Grammar subclass that reads classic pgen output files.

    The run() method reads the tables as produced by the pgen parser
    generator, typically contained in two C files, graminit.h and
    graminit.c.  The other methods are for internal use only.

    See the base class for more documentation.
    """

    def run(self, graminit_h, graminit_c):
        """Load the grammar tables from the text files written by pgen."""
        self.parse_graminit_h(graminit_h)
        self.parse_graminit_c(graminit_c)
        self.finish_off()

    def parse_graminit_h(self, filename):
        """Parse the .h file written by pgen.  (Internal)

        This file is a sequence of #define statements defining the
        nonterminals of the grammar as numbers.  We build two tables
        mapping the numbers to names and back.

        Returns True on success, False if the file could not be opened.
        """
        try:
            f = open(filename)
        except OSError as err:
            print("Can't open %s: %s" % (filename, err))
            return False
        # Context manager guarantees the handle is closed; the original
        # code leaked the file object on every call.
        with f:
            self.symbol2number = {}
            self.number2symbol = {}
            lineno = 0
            for line in f:
                lineno += 1
                mo = re.match(r"^#define\s+(\w+)\s+(\d+)$", line)
                if not mo:
                    # Tolerate blank lines silently; report anything
                    # else that fails to parse.  (The original fell
                    # through to mo.groups() on a blank line and raised
                    # AttributeError on None.)
                    if line.strip():
                        print("%s(%s): can't parse %s" % (filename, lineno,
                                                          line.strip()))
                    continue
                symbol, number = mo.groups()
                number = int(number)
                # The generated header must not define a symbol or a
                # number twice; both mappings must stay one-to-one.
                assert symbol not in self.symbol2number
                assert number not in self.number2symbol
                self.symbol2number[symbol] = number
                self.number2symbol[number] = symbol
        return True

    def parse_graminit_c(self, filename):
        """Parse the .c file written by pgen.  (Internal)

        The file looks as follows.  The first two lines are always this:

        #include "third_party/python/Include/pgenheaders.h"
        #include "third_party/python/Include/grammar.h"

        After that come four blocks:

        1) one or more state definitions
        2) a table defining dfas
        3) a table defining labels
        4) a struct defining the grammar

        A state definition has the following form:
        - one or more arc arrays, each of the form:
          static arc arcs_<n>_<m>[<k>] = {
            {<i>, <j>},
            ...
          };
        - followed by a state array, of the form:
          static state states_<s>[<t>] = {
            {<k>, arcs_<n>_<m>},
            ...
          };

        Returns False if the file could not be opened; asserts on any
        deviation from the expected generated layout.
        """
        try:
            f = open(filename)
        except OSError as err:
            print("Can't open %s: %s" % (filename, err))
            return False
        # Close the handle deterministically (the original leaked it).
        with f:
            # The code below essentially uses f's iterator-ness!
            lineno = 0

            # Expect the two #include lines
            lineno, line = lineno+1, next(f)
            assert line == '#include "pgenheaders.h"\n', (lineno, line)
            lineno, line = lineno+1, next(f)
            assert line == '#include "grammar.h"\n', (lineno, line)

            # Parse the state definitions
            lineno, line = lineno+1, next(f)
            allarcs = {}
            states = []
            while line.startswith("static arc "):
                # Inner loop: collect every arc array belonging to the
                # upcoming state array.
                while line.startswith("static arc "):
                    mo = re.match(r"static arc arcs_(\d+)_(\d+)\[(\d+)\] = {$",
                                  line)
                    assert mo, (lineno, line)
                    n, m, k = list(map(int, mo.groups()))
                    arcs = []
                    for _ in range(k):
                        lineno, line = lineno+1, next(f)
                        mo = re.match(r"\s+{(\d+), (\d+)},$", line)
                        assert mo, (lineno, line)
                        i, j = list(map(int, mo.groups()))
                        arcs.append((i, j))
                    lineno, line = lineno+1, next(f)
                    assert line == "};\n", (lineno, line)
                    allarcs[(n, m)] = arcs
                    lineno, line = lineno+1, next(f)
                # A state array follows; it references the arc arrays
                # just read, by their (n, m) indices.
                mo = re.match(r"static state states_(\d+)\[(\d+)\] = {$", line)
                assert mo, (lineno, line)
                s, t = list(map(int, mo.groups()))
                # States must appear in order: s is the next index.
                assert s == len(states), (lineno, line)
                state = []
                for _ in range(t):
                    lineno, line = lineno+1, next(f)
                    mo = re.match(r"\s+{(\d+), arcs_(\d+)_(\d+)},$", line)
                    assert mo, (lineno, line)
                    k, n, m = list(map(int, mo.groups()))
                    arcs = allarcs[n, m]
                    # Declared arc count must match the array parsed above.
                    assert k == len(arcs), (lineno, line)
                    state.append(arcs)
                states.append(state)
                lineno, line = lineno+1, next(f)
                assert line == "};\n", (lineno, line)
                lineno, line = lineno+1, next(f)
            self.states = states

            # Parse the dfas
            dfas = {}
            mo = re.match(r"static dfa dfas\[(\d+)\] = {$", line)
            assert mo, (lineno, line)
            ndfas = int(mo.group(1))
            for _ in range(ndfas):
                lineno, line = lineno+1, next(f)
                mo = re.match(r'\s+{(\d+), "(\w+)", (\d+), (\d+), states_(\d+),$',
                              line)
                assert mo, (lineno, line)
                symbol = mo.group(2)
                number, x, y, z = list(map(int, mo.group(1, 3, 4, 5)))
                # Cross-check against the tables built from graminit.h.
                assert self.symbol2number[symbol] == number, (lineno, line)
                assert self.number2symbol[number] == symbol, (lineno, line)
                assert x == 0, (lineno, line)
                state = states[z]
                assert y == len(state), (lineno, line)
                lineno, line = lineno+1, next(f)
                # The "first set" is emitted as a C string of octal
                # escapes; eval() turns it back into a Python string.
                # NOTE(review): eval on file content is safe only
                # because graminit.c is trusted generated output --
                # never point this at untrusted input.
                mo = re.match(r'\s+("(?:\\\d\d\d)*")},$', line)
                assert mo, (lineno, line)
                first = {}
                rawbitset = eval(mo.group(1))
                # Unpack the bitset: bit j of byte i set means label
                # number i*8+j is in the first set.
                for i, c in enumerate(rawbitset):
                    byte = ord(c)
                    for j in range(8):
                        if byte & (1 << j):
                            first[i*8 + j] = 1
                dfas[number] = (state, first)
            lineno, line = lineno+1, next(f)
            assert line == "};\n", (lineno, line)
            self.dfas = dfas

            # Parse the labels
            labels = []
            lineno, line = lineno+1, next(f)
            mo = re.match(r"static label labels\[(\d+)\] = {$", line)
            assert mo, (lineno, line)
            nlabels = int(mo.group(1))
            for _ in range(nlabels):
                lineno, line = lineno+1, next(f)
                mo = re.match(r'\s+{(\d+), (0|"\w+")},$', line)
                assert mo, (lineno, line)
                x, y = mo.groups()
                x = int(x)
                if y == "0":
                    # A NULL string in the C table means "no value".
                    y = None
                else:
                    # Strip the C quoting by evaluating the literal
                    # (trusted generated input, see note above).
                    y = eval(y)
                labels.append((x, y))
            lineno, line = lineno+1, next(f)
            assert line == "};\n", (lineno, line)
            self.labels = labels

            # Parse the grammar struct
            lineno, line = lineno+1, next(f)
            assert line == "grammar _PyParser_Grammar = {\n", (lineno, line)
            lineno, line = lineno+1, next(f)
            mo = re.match(r"\s+(\d+),$", line)
            assert mo, (lineno, line)
            ndfas = int(mo.group(1))
            assert ndfas == len(self.dfas)
            lineno, line = lineno+1, next(f)
            assert line == "\tdfas,\n", (lineno, line)
            lineno, line = lineno+1, next(f)
            mo = re.match(r"\s+{(\d+), labels},$", line)
            assert mo, (lineno, line)
            nlabels = int(mo.group(1))
            assert nlabels == len(self.labels), (lineno, line)
            lineno, line = lineno+1, next(f)
            mo = re.match(r"\s+(\d+)$", line)
            assert mo, (lineno, line)
            start = int(mo.group(1))
            assert start in self.number2symbol, (lineno, line)
            self.start = start
            lineno, line = lineno+1, next(f)
            assert line == "};\n", (lineno, line)
            # The grammar struct must be the last thing in the file.
            try:
                lineno, line = lineno+1, next(f)
            except StopIteration:
                pass
            else:
                assert 0, (lineno, line)

    def finish_off(self):
        """Create additional useful structures.  (Internal)."""
        self.keywords = {}  # map from keyword strings to arc labels
        self.tokens = {}    # map from numeric token values to arc labels
        for ilabel, (type, value) in enumerate(self.labels):
            if type == token.NAME and value is not None:
                # A NAME label with a value is a reserved keyword.
                self.keywords[value] = ilabel
            elif value is None:
                # A valueless label matches a token by its number.
                self.tokens[type] = ilabel
|