Make improvements

This commit is contained in:
Justine Tunney 2020-12-01 03:43:40 -08:00
parent 3e4fd4b0ad
commit e44a0cf6f8
256 changed files with 23100 additions and 2294 deletions

21
third_party/chibicc/LICENSE vendored Normal file
View file

@ -0,0 +1,21 @@
MIT License
Copyright (c) 2019 Rui Ueyama
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

720
third_party/chibicc/chibicc.c vendored Normal file
View file

@ -0,0 +1,720 @@
#include "third_party/chibicc/chibicc.h"
// Kind of an input file. The value is either forced with -x (see
// parse_opt_x) or derived from the file name suffix (see get_file_type).
typedef enum {
FILE_NONE, // no kind forced by -x; get_file_type then looks at the suffix
FILE_C,    // C source: "-x c" or a ".c" suffix
FILE_ASM,  // assembly: "-x assembler" or a ".s" suffix
FILE_OBJ,  // object file: ".o" suffix
FILE_AR,   // static archive: ".a" suffix
FILE_DSO,  // shared object: ".so" suffix
} FileType;
// Include search directories. parse_args appends the -I arguments and the
// entries it collected for -idirafter; add_default_include_paths appends
// the built-in directories.
StringArray include_paths;
bool opt_fcommon = true; // -fcommon (default) / -fno-common
bool opt_fpic; // -fpic or -fPIC
static FileType opt_x; // -x <kind>; FILE_NONE means "decide by file suffix"
static StringArray opt_include; // files named by -include, in command-line order
static bool opt_E; // -E
static bool opt_M; // -M
static bool opt_MD; // -MD (also set by -MMD)
static bool opt_MMD; // -MMD
static bool opt_MP; // -MP
static bool opt_S; // -S
static bool opt_c; // -c
static bool opt_cc1; // -cc1: this process is the compiler proper, not the driver
static bool opt_hash_hash_hash; // -###: echo subprocess command lines to stderr
static bool opt_static; // -static
static bool opt_shared; // -shared
static char *opt_MF; // -MF <file>
static char *opt_MT; // accumulated -MT / -MQ targets, separated by spaces
static char *opt_o; // -o <file> or -o<file>
// Extra linker arguments gathered from -Xlinker, -s, -static, -shared and -L.
static StringArray ld_extra_args;
// Copy of include_paths taken by add_default_include_paths; consulted by
// in_std_include_path for -MMD.
static StringArray std_include_paths;
char *base_file; // -cc1-input <file>: the file cc1() compiles
static char *output_file; // -cc1-output <file>: where cc1() writes assembly
// Positional arguments plus any -l... and -Wl,... options, in order.
static StringArray input_paths;
// NULL-terminated list of temporary files made by create_tmpfile and
// removed by cleanup; NULL until the first temporary file is created.
static char **tmpfiles;
// Writes the one-line usage summary to stderr, then terminates the
// process with `status` as its exit code. Does not return.
static void usage(int status) {
  fputs("chibicc [ -o <path> ] <file>\n", stderr);
  exit(status);
}
// Returns true if `arg` is an option whose value is passed as the NEXT
// command-line word (e.g. "-o out"), as opposed to being glued to the
// option ("-oout") or taking no value at all.
//
// parse_args uses this in a first pass to reject a command line that
// ends right after such an option. Every option that parse_args reads
// with argv[++i] must be listed here; otherwise a trailing "-D", "-U",
// "-L", ... would hand a NULL pointer to the code consuming the value.
static bool take_arg(char *arg) {
  static const char *const opts[] = {
      "-o",  "-I",  "-idirafter", "-include", "-x",         "-MF",
      "-MT", "-MQ", "-Xlinker",   "-D",       "-U",         "-L",
      "-cc1-input", "-cc1-output",
  };
  for (size_t i = 0; i < sizeof(opts) / sizeof(*opts); i++) {
    if (strcmp(arg, opts[i]) == 0) return true;
  }
  return false;
}
// Appends the built-in include directories to include_paths and then
// records the current contents of include_paths in std_include_paths.
//
// argv0 is the program path as given in argv[0]; it is not modified
// (dirname() operates on a private copy).
static void add_default_include_paths(char *argv0) {
  // chibicc's own headers are expected in an "include" directory located
  // next to the executable.
  char *exe_dir = dirname(strdup(argv0));
  char *own_include = calloc(1, strlen(argv0) + 10);
  sprintf(own_include, "%s/include", exe_dir);
  strarray_push(&include_paths, own_include);

  // Standard include path.
  strarray_push(&include_paths, ".");

  // Remember every path registered so far; -MMD uses this list to decide
  // which headers to leave out of the dependency output.
  int idx = 0;
  while (idx < include_paths.len) {
    strarray_push(&std_include_paths, include_paths.data[idx]);
    idx++;
  }
}
// Handles the value of a -D option. "NAME=BODY" defines NAME as BODY;
// a bare "NAME" defines NAME as "1".
static void define(char *str) {
  char *eq = strchr(str, '=');
  if (!eq) {
    define_macro(str, "1");
    return;
  }
  // Split at the first '=': the name is copied, the body is used in place.
  char *name = strndup(str, eq - str);
  define_macro(name, eq + 1);
}
// Maps the value of -x ("c", "assembler" or "none") to a FileType.
// Any other value is reported through error().
static FileType parse_opt_x(char *s) {
  static const struct {
    char *name;
    FileType type;
  } kinds[] = {
      {"c", FILE_C},
      {"assembler", FILE_ASM},
      {"none", FILE_NONE},
  };
  for (int i = 0; i < sizeof(kinds) / sizeof(*kinds); i++) {
    if (strcmp(s, kinds[i].name) == 0) return kinds[i].type;
  }
  error("<command line>: unknown argument for -x: %s", s);
}
// Returns a newly allocated copy of `s` escaped for use as a target name
// in a makefile:
//   '$'            -> "$$"
//   '#'            -> "\#"
//   ' ' or '\t'    -> preceded by a backslash, and every backslash that
//                     immediately precedes it in `s` is emitted a second
//                     time
// All other bytes are copied unchanged.
static char *quote_makefile(char *s) {
  // The output never exceeds twice the input length (plus the NUL).
  char *quoted = calloc(1, strlen(s) * 2 + 1);
  char *out = quoted;

  for (char *in = s; *in; in++) {
    char c = *in;
    if (c == '$') {
      *out++ = '$';
      *out++ = '$';
    } else if (c == '#') {
      *out++ = '\\';
      *out++ = '#';
    } else if (c == ' ' || c == '\t') {
      // Walk back over the run of backslashes directly before the blank
      // and emit one extra backslash for each of them.
      for (char *bs = in; bs > s && bs[-1] == '\\'; bs--) *out++ = '\\';
      *out++ = '\\';
      *out++ = c;
    } else {
      *out++ = c;
    }
  }
  return quoted;
}
// Parses the command line into the file-level option variables
// (opt_*, include_paths, input_paths, ld_extra_args, base_file,
// output_file).
//
// argc/argv are main()'s arguments; argv[0] is skipped. An option that
// needs a value but is the last word prints the usage text and exits.
// An unrecognized option, or a command line without any input, is
// reported through error(). "-" alone is treated as an input path.
//
// The order of the tests below matters: an exact match (e.g. "-o") has
// to be tried before the prefix match for the glued form ("-o<path>").
static void parse_args(int argc, char **argv) {
  // Make sure that all command line options that take an argument
  // have an argument.
  for (int i = 1; i < argc; i++)
    if (take_arg(argv[i]))
      if (!argv[++i]) usage(1);

  // -idirafter directories are collected separately because they must
  // end up after every other include path.
  StringArray idirafter = {};

  for (int i = 1; i < argc; i++) {
    if (!strcmp(argv[i], "-###")) {
      opt_hash_hash_hash = true;
      continue;
    }
    if (!strcmp(argv[i], "-cc1")) {
      opt_cc1 = true;
      continue;
    }
    if (!strcmp(argv[i], "--help")) usage(0);
    if (!strcmp(argv[i], "-o")) {
      opt_o = argv[++i];
      continue;
    }
    if (!strncmp(argv[i], "-o", 2)) {
      opt_o = argv[i] + 2;
      continue;
    }
    if (!strcmp(argv[i], "-S")) {
      opt_S = true;
      continue;
    }
    if (!strcmp(argv[i], "-fcommon")) {
      opt_fcommon = true;
      continue;
    }
    if (!strcmp(argv[i], "-fno-common")) {
      opt_fcommon = false;
      continue;
    }
    if (!strcmp(argv[i], "-c")) {
      opt_c = true;
      continue;
    }
    if (!strcmp(argv[i], "-E")) {
      opt_E = true;
      continue;
    }
    // "-I dir": the directory is the next word. Without this case the
    // prefix test below would store an empty path and "dir" would be
    // mistaken for an input file.
    if (!strcmp(argv[i], "-I")) {
      strarray_push(&include_paths, argv[++i]);
      continue;
    }
    if (!strncmp(argv[i], "-I", 2)) {
      strarray_push(&include_paths, argv[i] + 2);
      continue;
    }
    if (!strcmp(argv[i], "-D")) {
      define(argv[++i]);
      continue;
    }
    if (!strncmp(argv[i], "-D", 2)) {
      define(argv[i] + 2);
      continue;
    }
    if (!strcmp(argv[i], "-U")) {
      undef_macro(argv[++i]);
      continue;
    }
    if (!strncmp(argv[i], "-U", 2)) {
      undef_macro(argv[i] + 2);
      continue;
    }
    if (!strcmp(argv[i], "-include")) {
      strarray_push(&opt_include, argv[++i]);
      continue;
    }
    if (!strcmp(argv[i], "-x")) {
      opt_x = parse_opt_x(argv[++i]);
      continue;
    }
    if (!strncmp(argv[i], "-x", 2)) {
      opt_x = parse_opt_x(argv[i] + 2);
      continue;
    }
    // Library and raw linker options keep their position among the
    // inputs; main() sorts them out later.
    if (!strncmp(argv[i], "-l", 2) || !strncmp(argv[i], "-Wl,", 4)) {
      strarray_push(&input_paths, argv[i]);
      continue;
    }
    if (!strcmp(argv[i], "-Xlinker")) {
      strarray_push(&ld_extra_args, argv[++i]);
      continue;
    }
    if (!strcmp(argv[i], "-s")) {
      strarray_push(&ld_extra_args, "-s");
      continue;
    }
    if (!strcmp(argv[i], "-M")) {
      opt_M = true;
      continue;
    }
    if (!strcmp(argv[i], "-MF")) {
      opt_MF = argv[++i];
      continue;
    }
    if (!strcmp(argv[i], "-MP")) {
      opt_MP = true;
      continue;
    }
    // Repeated -MT / -MQ options accumulate, separated by a space.
    if (!strcmp(argv[i], "-MT")) {
      if (opt_MT == NULL)
        opt_MT = argv[++i];
      else
        opt_MT = format("%s %s", opt_MT, argv[++i]);
      continue;
    }
    if (!strcmp(argv[i], "-MD")) {
      opt_MD = true;
      continue;
    }
    if (!strcmp(argv[i], "-MQ")) {
      if (opt_MT == NULL)
        opt_MT = quote_makefile(argv[++i]);
      else
        opt_MT = format("%s %s", opt_MT, quote_makefile(argv[++i]));
      continue;
    }
    if (!strcmp(argv[i], "-MMD")) {
      opt_MD = opt_MMD = true;
      continue;
    }
    if (!strcmp(argv[i], "-fpic") || !strcmp(argv[i], "-fPIC")) {
      opt_fpic = true;
      continue;
    }
    if (!strcmp(argv[i], "-cc1-input")) {
      base_file = argv[++i];
      continue;
    }
    if (!strcmp(argv[i], "-cc1-output")) {
      output_file = argv[++i];
      continue;
    }
    // Store the directory that follows the option, not the option word
    // itself (argv[i++] would record "-idirafter" and drop the path).
    if (!strcmp(argv[i], "-idirafter")) {
      strarray_push(&idirafter, argv[++i]);
      continue;
    }
    if (!strcmp(argv[i], "-static")) {
      opt_static = true;
      strarray_push(&ld_extra_args, "-static");
      continue;
    }
    if (!strcmp(argv[i], "-shared")) {
      opt_shared = true;
      strarray_push(&ld_extra_args, "-shared");
      continue;
    }
    if (!strcmp(argv[i], "-L")) {
      strarray_push(&ld_extra_args, "-L");
      strarray_push(&ld_extra_args, argv[++i]);
      continue;
    }
    if (!strncmp(argv[i], "-L", 2)) {
      strarray_push(&ld_extra_args, "-L");
      strarray_push(&ld_extra_args, argv[i] + 2);
      continue;
    }
    if (!strcmp(argv[i], "-hashmap-test")) {
      hashmap_test();
      exit(0);
    }
    // These options are ignored for now.
    if (!strncmp(argv[i], "-O", 2) || !strncmp(argv[i], "-W", 2) ||
        !strncmp(argv[i], "-g", 2) || !strncmp(argv[i], "-std=", 5) ||
        !strcmp(argv[i], "-ffreestanding") ||
        !strcmp(argv[i], "-fno-builtin") ||
        !strcmp(argv[i], "-fno-omit-frame-pointer") ||
        !strcmp(argv[i], "-fno-stack-protector") ||
        !strcmp(argv[i], "-fno-strict-aliasing") || !strcmp(argv[i], "-m64") ||
        !strcmp(argv[i], "-mno-red-zone") || !strcmp(argv[i], "-w"))
      continue;
    if (argv[i][0] == '-' && argv[i][1] != '\0')
      error("unknown argument: %s", argv[i]);
    strarray_push(&input_paths, argv[i]);
  }

  for (int i = 0; i < idirafter.len; i++)
    strarray_push(&include_paths, idirafter.data[i]);

  if (input_paths.len == 0) error("no input files");

  // -E implies that the input is the C macro language.
  if (opt_E) opt_x = FILE_C;
}
// Opens `path` for writing and returns the stream. A NULL path or "-"
// selects stdout. Failure to open the file is reported through error().
static FILE *open_file(char *path) {
  bool to_stdout = !path || strcmp(path, "-") == 0;
  if (to_stdout) return stdout;

  FILE *fp = fopen(path, "w");
  if (fp) return fp;
  error("cannot open output file: %s: %s", path, strerror(errno));
}
// Returns true if string `p` ends with string `q`. An empty `q` matches
// every `p`.
static bool ends_with(char *p, char *q) {
  int plen = strlen(p);
  int qlen = strlen(q);
  if (plen < qlen) return false;
  char *tail = p + plen - qlen;
  return strcmp(tail, q) == 0;
}
// Replace file extension.
//
// Takes the last path component of `tmpl`, drops everything from its
// last '.' onward (if it has one) and appends `extn`. The directory part
// of `tmpl` is discarded: replace_extn("dir/foo.c", ".o") is "foo.o".
//
// Returns a newly allocated string; `tmpl` itself is left untouched.
static char *replace_extn(char *tmpl, char *extn) {
  // basename() may modify its argument, so give it a scratch copy.
  size_t tmpl_size = strlen(tmpl) + 1;
  char *scratch = calloc(1, tmpl_size);
  memcpy(scratch, tmpl, tmpl_size);
  char *filename = basename(scratch);

  // basename() may also return a pointer to static storage (e.g. "."),
  // so the stem is measured rather than cut by writing a NUL into it.
  size_t stem_len = strlen(filename);
  char *dot = strrchr(filename, '.');
  if (dot) stem_len = dot - filename;

  char *buf = calloc(1, stem_len + strlen(extn) + 1);
  sprintf(buf, "%.*s%s", (int)stem_len, filename, extn);

  // The result owns its own storage; the scratch copy is no longer needed.
  free(scratch);
  return buf;
}
// atexit handler: removes every temporary file recorded in tmpfiles.
static void cleanup(void) {
  if (!tmpfiles) return;
  for (char **p = tmpfiles; *p; p++) unlink(*p);
}
// Creates an empty temporary file named /tmp/chibicc-XXXXXX (the X's are
// filled in by mkstemp), records it in tmpfiles so that cleanup() removes
// it, and returns its heap-allocated path. A mkstemp failure is reported
// through error().
static char *create_tmpfile(void) {
  static const char pattern[] = "/tmp/chibicc-XXXXXX";
  // Size of the tmpfiles array after this call, counting the trailing
  // NULL entry.
  static int slots = 2;

  char *path = calloc(1, sizeof(pattern));
  memcpy(path, pattern, sizeof(pattern));

  int fd = mkstemp(path);
  if (fd == -1) error("mkstemp failed: %s", strerror(errno));
  close(fd);  // only the name is needed; the file is reopened by path later

  tmpfiles = realloc(tmpfiles, sizeof(char *) * slots);
  tmpfiles[slots - 2] = path;
  tmpfiles[slots - 1] = NULL;
  slots++;
  return path;
}
static void run_subprocess(char **argv) {
// If -### is given, dump the subprocess's command line.
if (opt_hash_hash_hash) {
fprintf(stderr, "%s", argv[0]);
for (int i = 1; argv[i]; i++) fprintf(stderr, " %s", argv[i]);
fprintf(stderr, "\n");
}
if (fork() == 0) {
// Child process. Run a new command.
execvp(argv[0], argv);
fprintf(stderr, "exec failed: %s: %s\n", argv[0], strerror(errno));
_exit(1);
}
// Wait for the child process to finish.
int status;
while (wait(&status) > 0)
;
if (status != 0) exit(1);
}
// Re-invokes this program as the compiler proper: the original command
// line is repeated with "-cc1" appended, followed by "-cc1-input <input>"
// and "-cc1-output <output>" for whichever of the two is non-NULL.
static void run_cc1(int argc, char **argv, char *input, char *output) {
  // Room for the original words, up to five extra ones and the NULL
  // terminator (calloc leaves the unused tail zeroed).
  char **cmd = calloc(argc + 10, sizeof(char *));
  memcpy(cmd, argv, argc * sizeof(char *));

  int n = argc;
  cmd[n++] = "-cc1";
  if (input) {
    cmd[n++] = "-cc1-input";
    cmd[n++] = input;
  }
  if (output) {
    cmd[n++] = "-cc1-output";
    cmd[n++] = output;
  }
  run_subprocess(cmd);
}
// Writes the token list to the -o file, or to stdout when -o is absent.
// Used for -E.
//
// A token that starts a source line is preceded by a newline (except for
// the very first token); any other token that followed whitespace is
// preceded by a single space. The output always ends with a newline.
static void print_tokens(Token *tok) {
  FILE *out = open_file(opt_o ? opt_o : "-");
  bool first = true;
  for (Token *t = tok; t->kind != TK_EOF; t = t->next) {
    if (t->at_bol) {
      if (!first) fprintf(out, "\n");
    } else if (t->has_space) {
      fprintf(out, " ");
    }
    fprintf(out, "%.*s", t->len, t->loc);
    first = false;
  }
  fprintf(out, "\n");
}
// Returns true if `path` lies under one of the directories recorded in
// std_include_paths, i.e. it starts with such a directory followed by '/'.
static bool in_std_include_path(char *path) {
  for (int i = 0; i < std_include_paths.len; i++) {
    char *prefix = std_include_paths.data[i];
    int n = strlen(prefix);
    if (strncmp(prefix, path, n) != 0) continue;
    if (path[n] == '/') return true;
  }
  return false;
}
// If one of the -M options is given, the compiler writes the list of
// input files in a format that the "make" command can read. This feature
// is used to automate file dependency management.
//
// Destination, in order of precedence: the -MF file; with -MD, the -o
// name (or else the input name) with its extension replaced by ".d"; the
// -o file; stdout.
static void print_dependencies(void) {
  char *path = "-";
  if (opt_MF)
    path = opt_MF;
  else if (opt_MD)
    path = replace_extn(opt_o ? opt_o : base_file, ".d");
  else if (opt_o)
    path = opt_o;
  FILE *out = open_file(path);

  // Rule target: the -MT/-MQ text if given, otherwise the object file
  // name derived from the input file.
  char *target =
      opt_MT ? opt_MT : quote_makefile(replace_extn(base_file, ".o"));
  fprintf(out, "%s:", target);

  // Prerequisites, one per continuation line. With -MMD, files found
  // under a standard include path are left out.
  File **files = get_input_files();
  for (File **f = files; *f; f++) {
    if (opt_MMD && in_std_include_path((*f)->name)) continue;
    fprintf(out, " \\\n %s", (*f)->name);
  }
  fprintf(out, "\n\n");

  if (!opt_MP) return;

  // -MP: emit an empty rule for every file after the first one.
  for (int i = 1; files[i]; i++) {
    if (opt_MMD && in_std_include_path(files[i]->name)) continue;
    fprintf(out, "%s:\n\n", quote_makefile(files[i]->name));
  }
}
static Token *must_tokenize_file(char *path) {
Token *tok = tokenize_file(path);
if (!tok) error("%s: %s", path, strerror(errno));
return tok;
}
// Concatenates two token lists and returns the head of the result.
//
// If tok1 is NULL or consists only of its TK_EOF token, tok2 is returned
// unchanged. Otherwise tok1's TK_EOF token is unlinked and tok2 is
// attached in its place.
static Token *append_tokens(Token *tok1, Token *tok2) {
  if (!tok1 || tok1->kind == TK_EOF) return tok2;

  // Find the last token before tok1's end-of-file marker.
  Token *last = tok1;
  for (; last->next->kind != TK_EOF; last = last->next) {
  }
  last->next = tok2;
  return tok1;
}
// The compiler proper (run when -cc1 is given): tokenizes the -include
// files followed by base_file, preprocesses the combined token stream
// and then, depending on the options, prints dependencies (-M/-MD),
// prints the preprocessed tokens (-E), or parses and writes assembly to
// output_file.
static void cc1(void) {
  Token *tok = NULL;

  // Process -include option
  for (int i = 0; i < opt_include.len; i++) {
    char *incl = opt_include.data[i];
    // A name that exists as given is used directly; otherwise it is
    // looked up in the include paths.
    char *path = incl;
    if (!file_exists(incl)) {
      path = search_include_paths(incl);
      if (!path) error("-include: %s: %s", incl, strerror(errno));
    }
    tok = append_tokens(tok, must_tokenize_file(path));
  }

  // Tokenize and parse.
  tok = append_tokens(tok, must_tokenize_file(base_file));
  tok = preprocess(tok);

  // If -M or -MD are given, print file dependencies. -M stops here,
  // -MD carries on with the compilation.
  if (opt_M || opt_MD) {
    print_dependencies();
    if (opt_M) return;
  }

  // If -E is given, print out preprocessed C code as a result.
  if (opt_E) {
    print_tokens(tok);
    return;
  }

  Obj *prog = parse(tok);

  // Traverse the AST to emit assembly.
  FILE *out = open_file(output_file);
  codegen(prog, out);
  fclose(out);
}
// Runs the external assembler "as" on `input`, writing the object file
// to `output`.
static void assemble(char *input, char *output) {
  char *as_argv[8];
  int n = 0;
  as_argv[n++] = "as";
  as_argv[n++] = "-W";
  as_argv[n++] = "-I.";
  as_argv[n++] = "-c";
  as_argv[n++] = input;
  as_argv[n++] = "-o";
  as_argv[n++] = output;
  as_argv[n] = NULL;
  run_subprocess(as_argv);
}
// Appends every string of the NULL-terminated list `items` to `arr`.
static void push_args(StringArray *arr, char **items) {
  for (; *items; items++) strarray_push(arr, *items);
}

// Builds the "ld" command line and runs it.
//
// inputs: object files, archives, shared objects and -l / linker options
//         in the order they are to be passed to ld.
// output: path of the file to produce.
//
// The start-up objects and library search directories are hard-coded
// paths; -shared and -static select between the alternative sets.
static void run_linker(StringArray *inputs, char *output) {
  char *crt_shared[] = {
      "/usr/lib/x86_64-linux-gnu/crti.o",
      "/usr/lib/gcc/x86_64-linux-gnu/9/crtbeginS.o",
      NULL,
  };
  char *crt_exec[] = {
      "/usr/lib/x86_64-linux-gnu/crt1.o",
      "/usr/lib/x86_64-linux-gnu/crti.o",
      "/usr/lib/gcc/x86_64-linux-gnu/9/crtbegin.o",
      NULL,
  };
  char *search_dirs[] = {
      "-L/usr/lib/gcc/x86_64-linux-gnu/9",
      "-L/usr/lib/x86_64-linux-gnu",
      "-L/usr/lib64",
      "-L/lib/x86_64-linux-gnu",
      "-L/lib64",
      "-L/usr/lib/x86_64-linux-gnu",
      "-L/usr/lib",
      "-L/lib",
      NULL,
  };
  char *dynamic_linker[] = {
      "-dynamic-linker",
      "/lib64/ld-linux-x86-64.so.2",
      NULL,
  };
  char *libs_static[] = {
      "--start-group", "-lgcc", "-lgcc_eh", "-lc", "--end-group", NULL,
  };
  char *libs_dynamic[] = {
      "-lc", "-lgcc", "--as-needed", "-lgcc_s", "--no-as-needed", NULL,
  };

  StringArray arr = {};
  strarray_push(&arr, "ld");
  strarray_push(&arr, "-o");
  strarray_push(&arr, output);
  strarray_push(&arr, "-m");
  strarray_push(&arr, "elf_x86_64");

  push_args(&arr, opt_shared ? crt_shared : crt_exec);
  push_args(&arr, search_dirs);
  if (!opt_static) push_args(&arr, dynamic_linker);

  // User-supplied linker options come before the inputs.
  for (int i = 0; i < ld_extra_args.len; i++)
    strarray_push(&arr, ld_extra_args.data[i]);
  for (int i = 0; i < inputs->len; i++) strarray_push(&arr, inputs->data[i]);

  push_args(&arr, opt_static ? libs_static : libs_dynamic);

  strarray_push(&arr, opt_shared ? "/usr/lib/gcc/x86_64-linux-gnu/9/crtendS.o"
                                 : "/usr/lib/gcc/x86_64-linux-gnu/9/crtend.o");
  strarray_push(&arr, "/usr/lib/x86_64-linux-gnu/crtn.o");

  // argv terminator for execvp().
  strarray_push(&arr, NULL);
  run_subprocess(arr.data);
}
// Determines the kind of an input file. A kind forced with -x wins;
// otherwise the file name suffix decides (.a, .so, .o, .c, .s). An
// unknown suffix is reported through error().
static FileType get_file_type(char *filename) {
  if (opt_x != FILE_NONE) return opt_x;

  static const struct {
    char *suffix;
    FileType type;
  } by_suffix[] = {
      {".a", FILE_AR}, {".so", FILE_DSO}, {".o", FILE_OBJ},
      {".c", FILE_C},  {".s", FILE_ASM},
  };
  for (int i = 0; i < sizeof(by_suffix) / sizeof(*by_suffix); i++) {
    if (ends_with(filename, by_suffix[i].suffix)) return by_suffix[i].type;
  }
  error("<command line>: unknown file extension: %s", filename);
}
// Program entry point.
//
// With -cc1 the process acts as the compiler proper and runs cc1() on
// the single file named by -cc1-input. Otherwise it acts as the driver:
// for every input it spawns itself with -cc1, the assembler and finally
// the linker as needed, depending on -E/-M/-S/-c.
//
// Temporary files created along the way are removed by cleanup(), which
// is registered with atexit().
int main(int argc, char **argv) {
  atexit(cleanup);
  init_macros();
  parse_args(argc, argv);

  if (opt_cc1) {
    add_default_include_paths(argv[0]);
    cc1();
    return 0;
  }

  // A single -o name cannot receive the output of several inputs.
  if (input_paths.len > 1 && opt_o && (opt_c || opt_S || opt_E))
    error("cannot specify '-o' with '-c,' '-S' or '-E' with multiple files");

  StringArray ld_args = {};

  for (int i = 0; i < input_paths.len; i++) {
    char *input = input_paths.data[i];

    // -l<lib> is handed to the linker as is.
    if (!strncmp(input, "-l", 2)) {
      strarray_push(&ld_args, input);
      continue;
    }

    // -Wl,a,b,c passes a, b and c to the linker as separate arguments.
    // strtok() modifies its input, hence the copy.
    if (!strncmp(input, "-Wl,", 4)) {
      char *s = strdup(input + 4);
      char *arg = strtok(s, ",");
      while (arg) {
        strarray_push(&ld_args, arg);
        arg = strtok(NULL, ",");
      }
      continue;
    }

    char *output;
    if (opt_o)
      output = opt_o;
    else if (opt_S)
      output = replace_extn(input, ".s");
    else
      output = replace_extn(input, ".o");

    FileType type = get_file_type(input);

    // Handle .o or .a
    if (type == FILE_OBJ || type == FILE_AR || type == FILE_DSO) {
      strarray_push(&ld_args, input);
      continue;
    }

    // Handle .s
    if (type == FILE_ASM) {
      if (!opt_S) assemble(input, output);
      continue;
    }

    assert(type == FILE_C);

    // Just preprocess
    if (opt_E || opt_M) {
      run_cc1(argc, argv, input, NULL);
      continue;
    }

    // Compile
    if (opt_S) {
      run_cc1(argc, argv, input, output);
      continue;
    }

    // Compile and assemble
    if (opt_c) {
      char *tmp = create_tmpfile();
      run_cc1(argc, argv, input, tmp);
      assemble(tmp, output);
      continue;
    }

    // Compile, assemble and link
    char *tmp1 = create_tmpfile();
    char *tmp2 = create_tmpfile();
    run_cc1(argc, argv, input, tmp1);
    assemble(tmp1, tmp2);
    strarray_push(&ld_args, tmp2);
    continue;
  }

  if (ld_args.len > 0) run_linker(&ld_args, opt_o ? opt_o : "a.out");
  return 0;
}

474
third_party/chibicc/chibicc.h vendored Normal file
View file

@ -0,0 +1,474 @@
#ifndef COSMOPOLITAN_THIRD_PARTY_CHIBICC_CHIBICC_H_
#define COSMOPOLITAN_THIRD_PARTY_CHIBICC_CHIBICC_H_
#if !(__ASSEMBLER__ + __LINKER__ + 0)
COSMOPOLITAN_C_START_
#define _POSIX_C_SOURCE 200809L
#include "libc/assert.h"
#include "libc/bits/popcnt.h"
#include "libc/calls/calls.h"
#include "libc/calls/struct/stat.h"
#include "libc/calls/weirdtypes.h"
#include "libc/conv/conv.h"
#include "libc/errno.h"
#include "libc/fmt/fmt.h"
#include "libc/log/log.h"
#include "libc/macros.h"
#include "libc/mem/mem.h"
#include "libc/runtime/runtime.h"
#include "libc/stdio/stdio.h"
#include "libc/stdio/temp.h"
#include "libc/str/str.h"
#include "libc/time/struct/tm.h"
#include "libc/time/time.h"
#include "libc/unicode/unicode.h"
#include "libc/x/x.h"
#include "third_party/gdtoa/gdtoa.h"
#pragma GCC diagnostic ignored "-Wswitch"
#ifndef __GNUC__
#define __attribute__(x)
#endif
// Forward declarations, so that the structures below can point at each
// other before their full definitions appear. Hideset is only used
// through pointers in this header and is not defined here.
typedef struct Type Type;
typedef struct Node Node;
typedef struct Member Member;
typedef struct Relocation Relocation;
typedef struct Hideset Hideset;
//
// strarray.c
//
// Growable array of string pointers.
typedef struct {
char **data; // element storage; NULL until the first push
int capacity; // number of slots allocated in data
int len; // number of slots in use
} StringArray;
// Appends s to arr, growing the storage when it is full.
void strarray_push(StringArray *arr, char *s);
//
// tokenize.c
//
// Token kinds: the lexical category stored in Token.kind.
typedef enum {
TK_RESERVED, // Keywords or punctuators
TK_IDENT, // Identifiers
TK_STR, // String literals
TK_NUM, // Numeric literals
TK_PP_NUM, // Preprocessing numbers
TK_EOF, // End-of-file marker; terminates a token list
} TokenKind;
// An input file known to the tokenizer (see new_file and
// get_input_files).
typedef struct {
char *name; // file name; printed in the -M dependency output
int file_no; // number given to new_file()
char *contents; // text given to new_file()
// For #line directive
char *display_name; // NOTE(review): presumably the name set by #line — confirm
int line_delta; // NOTE(review): presumably the line offset set by #line — confirm
} File;
// Token type. Tokens form a singly linked list through `next`; the list
// ends with a TK_EOF token.
typedef struct Token Token;
struct Token {
TokenKind kind; // Token kind
Token *next; // Next token
int64_t val; // If kind is TK_NUM, its integer value
long double fval; // If kind is TK_NUM, its value as a long double
char *loc; // Token location: start of the token's text
int len; // Token length in bytes
Type *ty; // Used if TK_NUM or TK_STR
char *str; // String literal contents including terminating '\0'
File *file; // Source location
char *filename; // Filename
int line_no; // Line number
int line_delta; // NOTE(review): presumably a #line adjustment (cf. File.line_delta) — confirm
bool at_bol; // True if this token is at beginning of line
bool has_space; // True if this token follows a space character
Hideset *hideset; // For macro expansion
Token *origin; // If this is expanded from a macro, the original token
};
// Diagnostics. All of these take a printf-style format string. The
// error* variants are declared noreturn; warn_tok is not.
noreturn void error(char *fmt, ...) __attribute__((format(printf, 1, 2)));
noreturn void error_at(char *loc, char *fmt, ...)
__attribute__((format(printf, 2, 3)));
noreturn void error_tok(Token *tok, char *fmt, ...)
__attribute__((format(printf, 2, 3)));
void warn_tok(Token *tok, char *fmt, ...) __attribute__((format(printf, 2, 3)));
// Tokenizer entry points and token helpers (implemented in tokenize.c).
bool equal(Token *tok, char *op);
Token *skip(Token *tok, char *op);
bool consume(Token **rest, Token *tok, char *str);
void convert_pp_tokens(Token *tok);
File **get_input_files(void);
File *new_file(char *name, int file_no, char *contents);
Token *tokenize_string_literal(Token *tok, Type *basety);
Token *tokenize(File *file);
Token *tokenize_file(char *filename);
// Reports an internal error, naming the source file and line of the
// macro invocation.
#define UNREACHABLE() error("internal error at %s:%d", __FILE__, __LINE__)
//
// preprocess.c
//
// printf-style formatting that returns the resulting string.
char *format(char *fmt, ...);
// Looks filename up in include_paths; the driver treats a NULL result as
// "not found".
char *search_include_paths(char *filename);
bool file_exists(char *path);
// Macro table management; init_macros is called once at start-up, before
// the command line is parsed.
void init_macros(void);
void define_macro(char *name, char *buf);
void undef_macro(char *name);
// Runs the preprocessor over a token list and returns the resulting list.
Token *preprocess(Token *tok);
//
// parse.c
//
// Variable or function. Objects are chained through `next`; parse()
// returns such a chain and codegen() consumes it.
typedef struct Obj Obj;
struct Obj {
Obj *next; // next object in the list
char *name; // Variable name
Type *ty; // Type
Token *tok; // representative token
bool is_local; // local or global/function
int align; // alignment
// Local variable
int offset;
// Global variable or function
bool is_function;
bool is_definition;
bool is_static;
// Global variable
bool is_tentative;
bool is_tls;
char *init_data;
Relocation *rel; // list of relocations (see Relocation)
// Function
bool is_inline;
Obj *params;
Node *body;
Obj *locals;
Obj *va_area;
Obj *alloca_bottom;
int stack_size;
// Static inline function
bool is_live;
bool is_root;
StringArray refs;
};
// A global variable can be initialized either by a constant expression
// or by a pointer to another global variable. This struct represents the
// latter.
typedef struct Relocation Relocation;
struct Relocation {
Relocation *next; // next relocation in the list
int offset; // NOTE(review): presumably the byte offset within the initializer — confirm
char **label; // NOTE(review): presumably the name of the referenced global — confirm
long addend; // NOTE(review): presumably a constant added to the address — confirm
};
// AST node kinds: the operation or statement stored in Node.kind.
typedef enum {
ND_NULL_EXPR, // Do nothing
ND_ADD, // +
ND_SUB, // -
ND_MUL, // *
ND_DIV, // /
ND_NEG, // unary -
ND_MOD, // %
ND_BITAND, // &
ND_BITOR, // |
ND_BITXOR, // ^
ND_SHL, // <<
ND_SHR, // >>
ND_EQ, // ==
ND_NE, // !=
ND_LT, // <
ND_LE, // <=
ND_ASSIGN, // =
ND_COND, // ?:
ND_COMMA, // ,
ND_MEMBER, // . (struct member access)
ND_ADDR, // unary &
ND_DEREF, // unary *
ND_NOT, // !
ND_BITNOT, // ~
ND_LOGAND, // &&
ND_LOGOR, // ||
ND_RETURN, // "return"
ND_IF, // "if"
ND_FOR, // "for" or "while"
ND_DO, // "do"
ND_SWITCH, // "switch"
ND_CASE, // "case"
ND_BLOCK, // { ... }
ND_GOTO, // "goto"
ND_GOTO_EXPR, // "goto" labels-as-values
ND_LABEL, // Labeled statement
ND_LABEL_VAL, // [GNU] Labels-as-values
ND_FUNCALL, // Function call
ND_EXPR_STMT, // Expression statement
ND_STMT_EXPR, // Statement expression
ND_VAR, // Variable
ND_VLA_PTR, // VLA designator
ND_NUM, // Numeric literal (see Node.val and Node.fval)
ND_CAST, // Type cast
ND_MEMZERO, // Zero-clear a stack variable
ND_ASM, // "asm"
ND_CAS, // Atomic compare-and-swap
ND_EXCH, // Atomic exchange
} NodeKind;
// AST node type. One structure serves every NodeKind; the field groups
// below are labelled with the kinds they belong to.
struct Node {
NodeKind kind; // Node kind
Node *next; // Next node
Type *ty; // Type, e.g. int or pointer to int
Token *tok; // Representative token
Node *lhs; // Left-hand side
Node *rhs; // Right-hand side
// "if" or "for" statement
Node *cond;
Node *then;
Node *els;
Node *init;
Node *inc;
// "break" and "continue" labels
char *brk_label;
char *cont_label;
// Block or statement expression
Node *body;
// Struct member access
Member *member;
// Function call
Type *func_ty;
Node *args;
bool pass_by_stack;
Obj *ret_buffer;
// Goto or labeled statement, or labels-as-values
char *label;
char *unique_label;
Node *goto_next;
// Switch
Node *case_next;
Node *default_case;
// Case
long begin;
long end;
// "asm" string literal
char *asm_str;
// Atomic compare-and-swap
Node *cas_addr;
Node *cas_old;
Node *cas_new;
// Atomic op= operators
Obj *atomic_addr;
Node *atomic_expr;
// Variable
Obj *var;
// Numeric literal
int64_t val;
long double fval;
};
// Parser entry points (implemented in parse.c). parse() turns a token
// list into the list of objects that codegen() consumes.
Node *new_cast(Node *expr, Type *ty);
int64_t const_expr(Token **rest, Token *tok);
Obj *parse(Token *tok);
//
// type.c
//
// Type kinds: the category stored in Type.kind.
typedef enum {
TY_VOID,
TY_BOOL,
TY_CHAR,
TY_SHORT,
TY_INT,
TY_LONG,
TY_FLOAT,
TY_DOUBLE,
TY_LDOUBLE, // long double
TY_ENUM,
TY_PTR, // pointer
TY_FUNC, // function
TY_ARRAY,
TY_VLA, // variable-length array
TY_STRUCT,
TY_UNION,
} TypeKind;
// Description of a C type. One structure serves every TypeKind; the
// field groups below are labelled with the kinds they belong to.
struct Type {
TypeKind kind;
int size; // sizeof() value
int align; // alignment
bool is_unsigned; // unsigned or signed
bool is_atomic; // true if _Atomic
Type *origin; // for type compatibility check
// Pointer-to or array-of type. We intentionally use the same member
// to represent pointer/array duality in C.
//
// In many contexts in which a pointer is expected, we examine this
// member instead of "kind" member to determine whether a type is a
// pointer or not. That means in many contexts "array of T" is
// naturally handled as if it were "pointer to T", as required by
// the C spec.
Type *base;
// Declaration
Token *name;
Token *name_pos;
// Array
int array_len;
// Variable-length array
Node *vla_len; // # of elements
Obj *vla_size; // sizeof() value
// Struct
Member *members;
bool is_flexible;
bool is_packed;
// Function type
Type *return_ty;
Type *params;
bool is_variadic;
Type *next;
};
// Struct member. The members of a type are chained through `next`,
// starting at Type.members.
struct Member {
Member *next; // next member of the same type
Type *ty; // member type
Token *tok; // for error message
Token *name; // member name token
int idx;
int align;
int offset;
// Bitfield
bool is_bitfield;
int bit_offset;
int bit_width;
};
// Shared Type objects for the basic types (defined in type.c). The
// "u"-prefixed names are the unsigned counterparts.
extern Type *ty_void;
extern Type *ty_bool;
extern Type *ty_char;
extern Type *ty_short;
extern Type *ty_int;
extern Type *ty_long;
extern Type *ty_uchar;
extern Type *ty_ushort;
extern Type *ty_uint;
extern Type *ty_ulong;
extern Type *ty_float;
extern Type *ty_double;
extern Type *ty_ldouble;
// Type predicates.
bool is_integer(Type *ty);
bool is_flonum(Type *ty);
bool is_numeric(Type *ty);
bool is_compatible(Type *t1, Type *t2);
// Type constructors.
Type *copy_type(Type *ty);
Type *pointer_to(Type *base);
Type *func_type(Type *return_ty);
Type *array_of(Type *base, int size);
Type *vla_of(Type *base, Node *expr);
Type *enum_type(void);
Type *struct_type(void);
// NOTE(review): presumably fills in Node.ty for node and its children — confirm.
void add_type(Node *node);
//
// codegen.c
//
// Emits assembly for the program `prog` to the stream `out`.
void codegen(Obj *prog, FILE *out);
// NOTE(review): presumably rounds n up to a multiple of align — confirm.
int align_to(int n, int align);
//
// unicode.c
//
// UTF-8 encode/decode helpers and identifier character classes.
int encode_utf8(char *buf, uint32_t c);
uint32_t decode_utf8(char **new_pos, char *p);
bool is_ident1(uint32_t c);
bool is_ident2(uint32_t c);
int str_width(char *p, int len);
//
// hashmap.c
//
// One slot of the open-addressing hash table.
typedef struct {
char *key; // NULL if the slot was never used; hashmap.c stores a tombstone marker in deleted slots
int keylen; // length of key in bytes
void *val; // value associated with key
} HashEntry;
// Hash table from byte-string keys to void pointers. A zero-filled
// HashMap is a valid empty map; keys are stored by pointer, not copied.
typedef struct {
HashEntry *buckets; // slot array; allocated on the first insertion
int capacity; // number of slots in buckets
int used; // number of slots whose key is not NULL (tombstones included)
} HashMap;
// The functions without a "2" suffix take a NUL-terminated key and use
// strlen() for its length; the "2" variants take the length explicitly.
// hashmap_get/hashmap_get2 return NULL when the key is absent.
void *hashmap_get(HashMap *map, char *key);
void *hashmap_get2(HashMap *map, char *key, int keylen);
void hashmap_put(HashMap *map, char *key, void *val);
void hashmap_put2(HashMap *map, char *key, int keylen, void *val);
void hashmap_delete(HashMap *map, char *key);
void hashmap_delete2(HashMap *map, char *key, int keylen);
// Self-test; prints "OK" on success.
void hashmap_test(void);
//
// main.c
//
// Driver state shared with the other translation units (defined in
// chibicc.c).
extern StringArray include_paths; // include search directories
extern bool opt_fpic; // -fpic / -fPIC
extern bool opt_fcommon; // -fcommon / -fno-common
extern char *base_file; // the file being compiled (-cc1-input)
// Linked list node holding an "asm" body.
// NOTE(review): presumably file-scope asm statements — confirm against parse.c.
typedef struct StaticAsm {
struct StaticAsm *next;
Node *body;
} StaticAsm;
extern struct StaticAsm *staticasms;
COSMOPOLITAN_C_END_
#endif /* !(__ASSEMBLER__ + __LINKER__ + 0) */
#endif /* COSMOPOLITAN_THIRD_PARTY_CHIBICC_CHIBICC_H_ */

78
third_party/chibicc/chibicc.mk vendored Normal file
View file

@ -0,0 +1,78 @@
#-*-mode:makefile-gmake;indent-tabs-mode:t;tab-width:8;coding:utf-8-*-┐
#───vi: set et ft=make ts=8 tw=8 fenc=utf-8 :vi───────────────────────┘
# Add this package to the global package list.
PKGS += THIRD_PARTY_CHIBICC
# The package has a single artifact: the static archive chibicc.a.
THIRD_PARTY_CHIBICC_ARTIFACTS += THIRD_PARTY_CHIBICC_A
THIRD_PARTY_CHIBICC = $(THIRD_PARTY_CHIBICC_A_DEPS) $(THIRD_PARTY_CHIBICC_A)
THIRD_PARTY_CHIBICC_A = o/$(MODE)/third_party/chibicc/chibicc.a
# Every file in the directory, split by suffix into headers, assembly
# sources and C sources.
THIRD_PARTY_CHIBICC_A_FILES := $(wildcard third_party/chibicc/*)
THIRD_PARTY_CHIBICC_A_HDRS = $(filter %.h,$(THIRD_PARTY_CHIBICC_A_FILES))
THIRD_PARTY_CHIBICC_A_SRCS_S = $(filter %.S,$(THIRD_PARTY_CHIBICC_A_FILES))
THIRD_PARTY_CHIBICC_A_SRCS_C = $(filter %.c,$(THIRD_PARTY_CHIBICC_A_FILES))
# Executables built from this package.
THIRD_PARTY_CHIBICC_BINS = \
o/$(MODE)/third_party/chibicc/chibicc.com
THIRD_PARTY_CHIBICC_A_SRCS = \
$(THIRD_PARTY_CHIBICC_A_SRCS_S) \
$(THIRD_PARTY_CHIBICC_A_SRCS_C)
# Objects in the archive: one .zip.o per source file plus one compiled
# .o per .S and .c file.
THIRD_PARTY_CHIBICC_A_OBJS = \
$(THIRD_PARTY_CHIBICC_A_SRCS:%=o/$(MODE)/%.zip.o) \
$(THIRD_PARTY_CHIBICC_A_SRCS_S:%.S=o/$(MODE)/%.o) \
$(THIRD_PARTY_CHIBICC_A_SRCS_C:%.c=o/$(MODE)/%.o)
# Checks: the package file and one .ok target per header.
THIRD_PARTY_CHIBICC_A_CHECKS = \
$(THIRD_PARTY_CHIBICC_A).pkg \
$(THIRD_PARTY_CHIBICC_A_HDRS:%=o/$(MODE)/%.ok)
# Packages this archive depends on directly.
THIRD_PARTY_CHIBICC_A_DIRECTDEPS = \
LIBC_STR \
LIBC_STUBS \
LIBC_FMT \
LIBC_NEXGEN32E \
LIBC_UNICODE \
LIBC_STDIO \
LIBC_MEM \
LIBC_LOG \
LIBC_CALLS \
LIBC_CALLS_HEFTY \
LIBC_TIME \
LIBC_X \
LIBC_CONV \
LIBC_RUNTIME \
THIRD_PARTY_GDTOA
# Expansion of each direct dependency's own variable, with duplicates
# removed.
THIRD_PARTY_CHIBICC_A_DEPS := \
$(call uniq,$(foreach x,$(THIRD_PARTY_CHIBICC_A_DIRECTDEPS),$($(x))))
# Prerequisites of the archive and of its package file.
$(THIRD_PARTY_CHIBICC_A): \
third_party/chibicc/ \
$(THIRD_PARTY_CHIBICC_A).pkg \
$(THIRD_PARTY_CHIBICC_A_OBJS)
$(THIRD_PARTY_CHIBICC_A).pkg: \
$(THIRD_PARTY_CHIBICC_A_OBJS) \
$(foreach x,$(THIRD_PARTY_CHIBICC_A_DIRECTDEPS),$($(x)_A).pkg)
# Pattern rule that links an executable from the matching object, the
# archive and its dependencies, using the $(APELINK) recipe.
o/$(MODE)/third_party/chibicc/%.com.dbg: \
$(THIRD_PARTY_CHIBICC_A_DEPS) \
$(THIRD_PARTY_CHIBICC_A) \
o/$(MODE)/third_party/chibicc/%.o \
$(THIRD_PARTY_CHIBICC_A).pkg \
$(CRT) \
$(APE)
@$(APELINK)
# Package-level aggregates over all artifacts.
THIRD_PARTY_CHIBICC_LIBS = $(foreach x,$(THIRD_PARTY_CHIBICC_ARTIFACTS),$($(x)))
THIRD_PARTY_CHIBICC_SRCS = $(foreach x,$(THIRD_PARTY_CHIBICC_ARTIFACTS),$($(x)_SRCS))
THIRD_PARTY_CHIBICC_HDRS = $(foreach x,$(THIRD_PARTY_CHIBICC_ARTIFACTS),$($(x)_HDRS))
THIRD_PARTY_CHIBICC_CHECKS = $(foreach x,$(THIRD_PARTY_CHIBICC_ARTIFACTS),$($(x)_CHECKS))
THIRD_PARTY_CHIBICC_OBJS = $(foreach x,$(THIRD_PARTY_CHIBICC_ARTIFACTS),$($(x)_OBJS))
# Every object also depends on the build files and on this makefile, so
# editing either triggers a rebuild.
$(THIRD_PARTY_CHIBICC_OBJS): $(BUILD_FILES) third_party/chibicc/chibicc.mk
# Convenience target: builds the binaries and runs the checks.
.PHONY: o/$(MODE)/third_party/chibicc
o/$(MODE)/third_party/chibicc: \
$(THIRD_PARTY_CHIBICC_BINS) \
$(THIRD_PARTY_CHIBICC_CHECKS)

1590
third_party/chibicc/codegen.c vendored Normal file

File diff suppressed because it is too large Load diff

130
third_party/chibicc/hashmap.c vendored Normal file
View file

@ -0,0 +1,130 @@
// This is an implementation of the open-addressing hash table.
#include "third_party/chibicc/chibicc.h"
#define TOMBSTONE ((void *)-1) // Represents a deleted hash entry
// Returns the 64-bit FNV-1 hash (multiply, then xor) of the `len` bytes
// starting at `s`. A length of zero yields the FNV offset basis.
static uint64_t fnv_hash(char *s, int len) {
  uint64_t h = 0xcbf29ce484222325;
  const unsigned char *p = (const unsigned char *)s;
  for (int left = len; left > 0; left--) {
    h *= 0x100000001b3;
    h ^= *p++;
  }
  return h;
}
// Make room for new entries in a given hashmap by removing
// tombstones and possibly extending the bucket size.
//
// The live entries are re-inserted into a freshly allocated table whose
// capacity is doubled until it is less than half full; `*map` is then
// overwritten with the new table.
static void rehash(HashMap *map) {
  // Count the live keys (neither empty nor deleted).
  int live = 0;
  for (HashEntry *e = map->buckets; e < map->buckets + map->capacity; e++) {
    if (e->key && e->key != TOMBSTONE) live++;
  }

  // Compute the size of the new hashmap.
  int newcap = map->capacity;
  while ((live * 100) / newcap >= 50) newcap = newcap * 2;

  // Create a new hashmap and copy all key-values.
  HashMap fresh = {};
  fresh.buckets = calloc(newcap, sizeof(HashEntry));
  fresh.capacity = newcap;
  for (int i = 0; i < map->capacity; i++) {
    HashEntry *e = &map->buckets[i];
    if (!e->key || e->key == TOMBSTONE) continue;
    hashmap_put2(&fresh, e->key, e->keylen, e->val);
  }

  assert(fresh.used == live);
  *map = fresh;
}
// Returns true if slot `ent` holds a live entry whose key equals the
// `keylen` bytes at `key`.
static bool match(HashEntry *ent, char *key, int keylen) {
  if (!ent->key || ent->key == TOMBSTONE) return false;
  if (ent->keylen != keylen) return false;
  return memcmp(ent->key, key, keylen) == 0;
}
// Looks up `key` and returns its slot, or NULL if the key is absent (or
// the map has no table yet).
//
// Probing is linear from the key's hash and stops at the first
// never-used slot; tombstones are stepped over.
static HashEntry *get_entry(HashMap *map, char *key, int keylen) {
  if (!map->buckets) return NULL;

  uint64_t start = fnv_hash(key, keylen);
  int probe = 0;
  while (probe < map->capacity) {
    HashEntry *slot = &map->buckets[(start + probe) % map->capacity];
    if (match(slot, key, keylen)) return slot;
    if (slot->key == NULL) return NULL;
    probe++;
  }
  UNREACHABLE();
}
// Returns the slot for `key`, creating it if the key is not in the map.
//
// The table is allocated (16 slots) on first use and rehashed when at
// least 70% of the slots have been used. A new key is stored by pointer
// (not copied) in the first tombstone seen along its probe sequence, or
// else in the first never-used slot.
//
// The probe must not stop at the first tombstone: the key may still be
// stored further along the sequence (an earlier colliding key was
// deleted after this one was inserted). Stopping early would create a
// second entry for the same key, and the stale one would resurface once
// the newer entry is deleted.
static HashEntry *get_or_insert_entry(HashMap *map, char *key, int keylen) {
  if (!map->buckets) {
    map->buckets = calloc((map->capacity = 16), sizeof(HashEntry));
  }
  if ((map->used * 100) / map->capacity >= 70) rehash(map);

  uint64_t hash = fnv_hash(key, keylen);
  HashEntry *reusable = NULL;  // first tombstone on the probe sequence

  for (int i = 0; i < map->capacity; i++) {
    HashEntry *ent = &map->buckets[(hash + i) % map->capacity];
    if (match(ent, key, keylen)) return ent;
    if (ent->key == TOMBSTONE) {
      if (!reusable) reusable = ent;
      continue;
    }
    if (ent->key == NULL) {
      // End of the probe sequence: the key is definitely absent.
      if (reusable) break;
      ent->key = key;
      ent->keylen = keylen;
      map->used++;
      return ent;
    }
  }

  if (reusable) {
    // Recycling a tombstone does not change `used`, which counts every
    // slot that is not NULL.
    reusable->key = key;
    reusable->keylen = keylen;
    return reusable;
  }
  UNREACHABLE();
}
// Returns the value stored under the NUL-terminated `key`, or NULL if
// the key is absent.
void *hashmap_get(HashMap *map, char *key) {
  int keylen = strlen(key);
  return hashmap_get2(map, key, keylen);
}

// Same as hashmap_get, for a key given as `keylen` bytes at `key`.
void *hashmap_get2(HashMap *map, char *key, int keylen) {
  HashEntry *found = get_entry(map, key, keylen);
  if (!found) return NULL;
  return found->val;
}

// Stores `val` under the NUL-terminated `key`, replacing any previous
// value. The key is stored by pointer, not copied.
void hashmap_put(HashMap *map, char *key, void *val) {
  int keylen = strlen(key);
  hashmap_put2(map, key, keylen, val);
}

// Same as hashmap_put, for a key given as `keylen` bytes at `key`.
void hashmap_put2(HashMap *map, char *key, int keylen, void *val) {
  get_or_insert_entry(map, key, keylen)->val = val;
}

// Removes the NUL-terminated `key`; does nothing if it is absent.
void hashmap_delete(HashMap *map, char *key) {
  int keylen = strlen(key);
  hashmap_delete2(map, key, keylen);
}

// Same as hashmap_delete, for a key given as `keylen` bytes at `key`.
// The slot is marked with a tombstone rather than cleared, so that
// probe sequences passing through it stay intact.
void hashmap_delete2(HashMap *map, char *key, int keylen) {
  HashEntry *found = get_entry(map, key, keylen);
  if (!found) return;
  found->key = TOMBSTONE;
}
// Self-test for the hash map: inserts, deletes, re-inserts and
// overwrites keys, asserting the visible contents after each phase.
// Prints "OK" when every assertion holds.
//
// The "absent key" loops query the actual deleted or never-inserted
// keys rather than one constant string, and keys 6000-6999 are read
// back after being overwritten.
void hashmap_test(void) {
  HashMap *map = calloc(1, sizeof(HashMap));
  // Populate, punch a hole, then partially refill it.
  for (int i = 0; i < 5000; i++)
    hashmap_put(map, format("key %d", i), (void *)(size_t)i);
  for (int i = 1000; i < 2000; i++) hashmap_delete(map, format("key %d", i));
  for (int i = 1500; i < 1600; i++)
    hashmap_put(map, format("key %d", i), (void *)(size_t)i);
  for (int i = 6000; i < 7000; i++)
    hashmap_put(map, format("key %d", i), (void *)(size_t)i);
  // Untouched keys keep their values.
  for (int i = 0; i < 1000; i++)
    assert((size_t)hashmap_get(map, format("key %d", i)) == (size_t)i);
  // Deleted keys must be gone.
  for (int i = 1000; i < 1500; i++)
    assert(hashmap_get(map, format("key %d", i)) == NULL);
  // Re-inserted keys are visible again.
  for (int i = 1500; i < 1600; i++)
    assert((size_t)hashmap_get(map, format("key %d", i)) == (size_t)i);
  for (int i = 1600; i < 2000; i++)
    assert(hashmap_get(map, format("key %d", i)) == NULL);
  for (int i = 2000; i < 5000; i++)
    assert((size_t)hashmap_get(map, format("key %d", i)) == (size_t)i);
  // Keys that were never inserted are absent.
  for (int i = 5000; i < 6000; i++)
    assert(hashmap_get(map, format("key %d", i)) == NULL);
  // Overwriting existing keys must not disturb them.
  for (int i = 6000; i < 7000; i++)
    hashmap_put(map, format("key %d", i), (void *)(size_t)i);
  for (int i = 6000; i < 7000; i++)
    assert((size_t)hashmap_get(map, format("key %d", i)) == (size_t)i);
  assert(hashmap_get(map, "no such key") == NULL);
  printf("OK\n");
}

3301
third_party/chibicc/parse.c vendored Normal file

File diff suppressed because it is too large Load diff

1099
third_party/chibicc/preprocess.c vendored Normal file

File diff suppressed because it is too large Load diff

16
third_party/chibicc/strarray.c vendored Normal file
View file

@ -0,0 +1,16 @@
#include "third_party/chibicc/chibicc.h"
// Appends `s` to `arr`. Storage starts at 8 slots and doubles whenever
// it is full; slots added by growth are set to NULL.
void strarray_push(StringArray *arr, char *s) {
  if (arr->data == NULL) {
    arr->data = calloc(8, sizeof(char *));
    arr->capacity = 8;
  }
  if (arr->len == arr->capacity) {
    arr->data = realloc(arr->data, sizeof(char *) * arr->capacity * 2);
    arr->capacity *= 2;
    int i = arr->len;
    while (i < arr->capacity) arr->data[i++] = NULL;
  }
  arr->data[arr->len++] = s;
}

785
third_party/chibicc/tokenize.c vendored Normal file
View file

@ -0,0 +1,785 @@
#include "third_party/chibicc/chibicc.h"
// The file currently being tokenized. Set by tokenize() and read by
// error_at(), new_token() and add_line_numbers().
static File *current_file;
// A list of all input files. Grown by tokenize_file(), which keeps it
// NULL-terminated.
static File **input_files;
// True if the current position is at the beginning of a line.
// new_token() copies this into the token and then clears it.
static bool at_bol;
// True if the current position follows a space character.
// new_token() copies this into the token and then clears it.
static bool has_space;
// Prints a printf-style message followed by a newline to stderr and
// terminates the process with exit status 1.
void error(char *fmt, ...) {
  va_list args;
  va_start(args, fmt);
  vfprintf(stderr, fmt, args);
  fputc('\n', stderr);
  exit(1);
}
// Prints a diagnostic to stderr in the following format.
//
// foo.c:10: x = y + 1;
//               ^ <error message here>
//
// `input` is the start of the buffer containing `loc`; the line around
// `loc` is echoed and a caret is placed under `loc`.
static void verror_at(char *filename, char *input, int line_no, char *loc,
                      char *fmt, va_list ap) {
  // Walk back to the first byte of the line holding `loc`.
  char *bol = loc;
  for (; input < bol && bol[-1] != '\n'; bol--) {
  }
  // Walk forward to the newline (or NUL) that ends it.
  char *eol = loc;
  for (; *eol && *eol != '\n'; eol++) {
  }
  // Echo the line behind a "file:line: " prefix.
  int prefix = fprintf(stderr, "%s:%d: ", filename, line_no);
  fprintf(stderr, "%.*s\n", (int)(eol - bol), bol);
  // Pad to the display column of `loc`, then print caret and message.
  int column = str_width(bol, loc - bol) + prefix;
  fprintf(stderr, "%*s", column, "");
  fprintf(stderr, "^ ");
  vfprintf(stderr, fmt, ap);
  fprintf(stderr, "\n");
}
// Reports an error at `loc`, a position inside the current file's
// contents, and exits with status 1. The line number is found by
// counting newlines before `loc`.
void error_at(char *loc, char *fmt, ...) {
  int line_no = 1;
  char *scan = current_file->contents;
  while (scan < loc) {
    if (*scan++ == '\n') line_no++;
  }
  va_list ap;
  va_start(ap, fmt);
  verror_at(current_file->name, current_file->contents, line_no, loc, fmt, ap);
  exit(1);
}
// Reports an error at the source position of `tok`, using the file and
// line number recorded in the token, then exits with status 1.
void error_tok(Token *tok, char *fmt, ...) {
  File *src = tok->file;
  va_list ap;
  va_start(ap, fmt);
  verror_at(src->name, src->contents, tok->line_no, tok->loc, fmt, ap);
  exit(1);
}
// Prints a diagnostic at the source position of `tok`, in the same
// format as error_tok(), and returns to the caller.
//
// Because this function returns, the va_list must be released with
// va_end(); returning without it is undefined behavior.
void warn_tok(Token *tok, char *fmt, ...) {
  va_list ap;
  va_start(ap, fmt);
  verror_at(tok->file->name, tok->file->contents, tok->line_no, tok->loc, fmt,
            ap);
  va_end(ap);
}
// Returns true if the text of `tok` is exactly `op`. The token is only
// inspected, never consumed.
bool equal(Token *tok, char *op) {
  if (strlen(op) != tok->len) return false;
  return strncmp(tok->loc, op, tok->len) == 0;
}
// Requires the current token to be `op` and returns the token after
// it. Reports "expected '<op>'" through error_tok() otherwise.
Token *skip(Token *tok, char *op) {
  if (equal(tok, op)) return tok->next;
  error_tok(tok, "expected '%s'", op);
  return tok->next;
}
// If `tok` is `str`, stores the following token in `*rest` and returns
// true. Otherwise stores `tok` itself in `*rest` and returns false.
bool consume(Token **rest, Token *tok, char *str) {
  bool matched = equal(tok, str);
  *rest = matched ? tok->next : tok;
  return matched;
}
// Allocates a token of `kind` covering the bytes [start, end) of the
// current file. The pending at_bol/has_space flags are recorded in the
// token and then reset. The caller links the token into its list.
static Token *new_token(TokenKind kind, char *start, char *end) {
  Token *t = calloc(1, sizeof(Token));
  t->kind = kind;
  t->loc = start;
  t->len = end - start;
  t->file = current_file;
  t->filename = current_file->display_name;
  t->at_bol = at_bol;
  t->has_space = has_space;
  has_space = false;
  at_bol = false;
  return t;
}
// Returns true if the string `p` begins with the string `q`.
static bool starts_with(char *p, char *q) {
  size_t prefix_len = strlen(q);
  return !strncmp(p, q, prefix_len);
}
// Scans an identifier starting at `p` and returns a pointer just past
// its last character.
//
// Returns NULL if `p` does not point at a valid identifier start.
static char *read_ident(char *p) {
  char *next;
  if (!is_ident1(decode_utf8(&next, p))) return NULL;
  p = next;
  // Keep absorbing code points while they may continue an identifier.
  while (is_ident2(decode_utf8(&next, p))) p = next;
  return p;
}
// Converts one hexadecimal digit to its value. The caller must pass a
// valid hex digit: anything that is not 0-9 or a-f is treated as an
// uppercase A-F digit without further checking.
static int from_hex(char c) {
  if (c >= '0' && c <= '9') return c - '0';
  if (c >= 'a' && c <= 'f') return 10 + (c - 'a');
  return 10 + (c - 'A');
}
// Returns true if the text of `tok` is one of the keywords below. The
// lookup table is built on the first call and reused afterwards.
static bool is_keyword(Token *tok) {
  static HashMap map;
  if (map.capacity == 0) {
    static char *kw[] = {
        "return",    "if",         "else",
        "for",       "while",      "int",
        "sizeof",    "char",       "struct",
        "union",     "short",      "long",
        "void",      "typedef",    "_Bool",
        "enum",      "static",     "goto",
        "break",     "continue",   "switch",
        "case",      "default",    "extern",
        "_Alignof",  "_Alignas",   "do",
        "signed",    "unsigned",   "const",
        "volatile",  "auto",       "register",
        "restrict",  "__restrict", "__restrict__",
        "_Noreturn", "float",      "double",
        "typeof",    "asm",        "_Thread_local",
        "__thread",  "_Atomic",    "__attribute__",
    };
    size_t count = sizeof(kw) / sizeof(*kw);
    for (size_t k = 0; k < count; k++) {
      // Any non-NULL value works; only presence is tested.
      hashmap_put(&map, kw[k], (void *)1);
    }
  }
  return hashmap_get2(&map, tok->loc, tok->len) != NULL;
}
// Decodes the escape sequence whose first character after the
// backslash is at `p`. Returns the character value and stores the
// position following the sequence in `*new_pos`.
static int read_escaped_char(char **new_pos, char *p) {
  // Octal escape: between one and three octal digits.
  if ('0' <= *p && *p <= '7') {
    int c = 0;
    for (int digits = 0; digits < 3 && '0' <= *p && *p <= '7'; digits++)
      c = (c << 3) + (*p++ - '0');
    *new_pos = p;
    return c;
  }
  // Hexadecimal escape: \x followed by any number of hex digits.
  if (*p == 'x') {
    p++;
    if (!isxdigit(*p)) error_at(p, "invalid hex escape sequence");
    int c = 0;
    while (isxdigit(*p)) {
      c = (c << 4) + from_hex(*p);
      p++;
    }
    *new_pos = p;
    return c;
  }
  // Single-character escapes. Unknown ones stand for themselves.
  *new_pos = p + 1;
  switch (*p) {
    case 'a':
      return '\a';
    case 'b':
      return '\b';
    case 't':
      return '\t';
    case 'n':
      return '\n';
    case 'v':
      return '\v';
    case 'f':
      return '\f';
    case 'r':
      return '\r';
    // [GNU] \e for the ASCII escape character is a GNU C extension.
    case 'e':
      return 27;
    default:
      return *p;
  }
}
// Returns a pointer to the double-quote that closes a string literal
// whose body starts at `p`. A backslash makes the following character
// part of the body. Reports an error if NUL is reached first.
static char *string_literal_end(char *p) {
  char *start = p;
  while (*p != '"') {
    if (*p == '\0') error_at(start, "unclosed string literal");
    if (*p == '\\') p++;
    p++;
  }
  return p;
}
// Reads a narrow string literal. `start` is the first byte of the
// token (including any prefix) and `quote` is its opening
// double-quote. The decoded bytes are stored in the token together
// with a char array type of length len + 1.
static Token *read_string_literal(char *start, char *quote) {
  char *end = string_literal_end(quote + 1);
  // The decoded form is never longer than the source body; one extra
  // zeroed byte serves as the terminator.
  char *buf = calloc(1, end - quote);
  int len = 0;
  char *p = quote + 1;
  while (p < end) {
    char ch;
    if (*p == '\\') {
      ch = read_escaped_char(&p, p + 1);
    } else {
      ch = *p++;
    }
    buf[len++] = ch;
  }
  Token *tok = new_token(TK_STR, start, end + 1);
  tok->ty = array_of(ty_char, len + 1);
  tok->str = buf;
  return tok;
}
// Read a UTF-8-encoded string literal and transcode it in UTF-16.
//
// UTF-16 is yet another variable-width encoding for Unicode. Code
// points smaller than U+10000 are encoded in 2 bytes. Code points
// equal to or larger than that are encoded in 4 bytes. Each 2 bytes
// in the 4 byte sequence is called "surrogate", and a 4 byte sequence
// is called a "surrogate pair".
//
// `start` is the first byte of the token and `quote` its opening
// double-quote; the two are equal when the literal has no prefix.
static Token *read_utf16_string_literal(char *start, char *quote) {
  char *end = string_literal_end(quote + 1);
  // One 16-bit unit per source byte is an upper bound on the output,
  // plus one zeroed unit for the terminator covered by the `len + 1`
  // array type below. The size is measured from `quote`, not `start`,
  // so the terminator slot exists even when start == quote.
  uint16_t *buf = calloc(2, end - quote);
  int len = 0;
  for (char *p = quote + 1; p < end;) {
    if (*p == '\\') {
      buf[len++] = read_escaped_char(&p, p + 1);
      continue;
    }
    uint32_t c = decode_utf8(&p, p);
    if (c < 0x10000) {
      // Encode a code point in 2 bytes.
      buf[len++] = c;
    } else {
      // Encode a code point in 4 bytes.
      c -= 0x10000;
      buf[len++] = 0xd800 + ((c >> 10) & 0x3ff);
      buf[len++] = 0xdc00 + (c & 0x3ff);
    }
  }
  Token *tok = new_token(TK_STR, start, end + 1);
  tok->ty = array_of(ty_ushort, len + 1);
  tok->str = (char *)buf;
  return tok;
}
// Reads a UTF-8-encoded string literal and converts it to UTF-32,
// a fixed-width encoding that uses 4 bytes per code point.
//
// `ty` is the element type recorded in the resulting array type.
static Token *read_utf32_string_literal(char *start, char *quote, Type *ty) {
  char *end = string_literal_end(quote + 1);
  // At most one element per source byte, plus a zeroed terminator.
  uint32_t *buf = calloc(4, end - quote);
  int len = 0;
  char *p = quote + 1;
  while (p < end) {
    if (*p == '\\') {
      buf[len++] = read_escaped_char(&p, p + 1);
    } else {
      buf[len++] = decode_utf8(&p, p);
    }
  }
  Token *tok = new_token(TK_STR, start, end + 1);
  tok->ty = array_of(ty, len + 1);
  tok->str = (char *)buf;
  return tok;
}
// Reads a character literal. `start` is the first byte of the token
// and `quote` its opening single-quote. The value of the first
// character (or escape) becomes the token value, typed as `ty`; the
// token extends to the next single-quote.
static Token *read_char_literal(char *start, char *quote, Type *ty) {
  char *p = quote + 1;
  if (*p == '\0') error_at(start, "unclosed char literal");
  int c;
  if (*p != '\\') {
    c = decode_utf8(&p, p);
  } else {
    c = read_escaped_char(&p, p + 1);
  }
  char *close = strchr(p, '\'');
  if (close == NULL) error_at(p, "unclosed char literal");
  Token *tok = new_token(TK_NUM, start, close + 1);
  tok->val = c;
  tok->ty = ty;
  return tok;
}
// Tries to turn the pp-number token `tok` into an integer constant.
//
// On success the token kind becomes TK_NUM, its value and type are
// set, and true is returned. If the digits plus an optional U/L/LL
// suffix do not cover the whole token, the token is left unchanged
// and false is returned.
static bool convert_pp_int(Token *tok) {
char *p = tok->loc;
// Read a binary, octal, decimal or hexadecimal number.
// A "0x"/"0b" prefix only counts when a valid digit follows it.
int base = 10;
if (!strncasecmp(p, "0x", 2) && isxdigit(p[2])) {
p += 2;
base = 16;
} else if (!strncasecmp(p, "0b", 2) && (p[2] == '0' || p[2] == '1')) {
p += 2;
base = 2;
} else if (*p == '0') {
base = 8;
}
int64_t val = strtoul(p, &p, base);
// Read U, L or LL suffixes.
// Longer suffixes are tested first so that e.g. "ull" is not
// consumed as a bare "u".
bool l = false;
bool u = false;
if (starts_with(p, "LLU") || starts_with(p, "LLu") || starts_with(p, "llU") ||
starts_with(p, "llu") || starts_with(p, "ULL") || starts_with(p, "Ull") ||
starts_with(p, "uLL") || starts_with(p, "ull")) {
p += 3;
l = u = true;
} else if (!strncasecmp(p, "lu", 2) || !strncasecmp(p, "ul", 2)) {
p += 2;
l = u = true;
} else if (starts_with(p, "LL") || starts_with(p, "ll")) {
p += 2;
l = true;
} else if (*p == 'L' || *p == 'l') {
p++;
l = true;
} else if (*p == 'U' || *p == 'u') {
p++;
u = true;
}
// Anything left over means this is not an integer constant.
if (p != tok->loc + tok->len) return false;
// Infer a type.
// Decimal constants without a U suffix never become unsigned here;
// constants in other bases may, depending on which high bits are set.
Type *ty;
if (base == 10) {
if (l && u)
ty = ty_ulong;
else if (l)
ty = ty_long;
else if (u)
ty = (val >> 32) ? ty_ulong : ty_uint;
else
ty = (val >> 31) ? ty_long : ty_int;
} else {
if (l && u)
ty = ty_ulong;
else if (l)
ty = (val >> 63) ? ty_ulong : ty_long;
else if (u)
ty = (val >> 32) ? ty_ulong : ty_uint;
else if (val >> 63)
ty = ty_ulong;
else if (val >> 32)
ty = ty_long;
else if (val >> 31)
ty = ty_uint;
else
ty = ty_int;
}
tok->kind = TK_NUM;
tok->val = val;
tok->ty = ty;
return true;
}
// Numeric literals are defined more loosely at the preprocessing
// stage than afterwards, so the tokenizer first produces "pp-number"
// tokens and converts them to regular number tokens once
// preprocessing is done.
//
// This function performs that conversion for one token: integer first,
// then floating point with an optional f/F or l/L suffix. A token that
// is neither is reported as an invalid numeric constant.
static void convert_pp_number(Token *tok) {
  if (convert_pp_int(tok)) return;
  // Not an integer, so it has to be a floating point constant.
  char *end;
  long double val = strtold(tok->loc, &end);
  Type *ty = ty_double;
  switch (*end) {
    case 'f':
    case 'F':
      ty = ty_float;
      end++;
      break;
    case 'l':
    case 'L':
      ty = ty_ldouble;
      end++;
      break;
    default:
      break;
  }
  if (end != tok->loc + tok->len) error_tok(tok, "invalid numeric constant");
  tok->kind = TK_NUM;
  tok->fval = val;
  tok->ty = ty;
}
// Walks the token list up to the EOF token, marking keywords as
// TK_RESERVED and converting pp-number tokens to number tokens.
void convert_pp_tokens(Token *tok) {
  for (Token *t = tok; t->kind != TK_EOF; t = t->next) {
    if (is_keyword(t)) {
      t->kind = TK_RESERVED;
      continue;
    }
    if (t->kind == TK_PP_NUM) convert_pp_number(t);
  }
}
// Assigns a line number to every token in the list by scanning the
// current file once and counting newlines. Tokens must appear in the
// list in source order.
static void add_line_numbers(Token *tok) {
  int line = 1;
  for (char *p = current_file->contents;; p++) {
    if (p == tok->loc) {
      tok->line_no = line;
      tok = tok->next;
    }
    if (*p == '\n') line++;
    // The terminating NUL is visited too, so a token located there
    // still gets its line number.
    if (*p == '\0') break;
  }
}
// Re-reads the string literal at `tok` as a wide string whose element
// type is `basety`: UTF-16 when the element size is 2, UTF-32
// otherwise. The new token takes over `tok`'s successor.
Token *tokenize_string_literal(Token *tok, Type *basety) {
  Token *wide = (basety->size == 2)
                    ? read_utf16_string_literal(tok->loc, tok->loc)
                    : read_utf32_string_literal(tok->loc, tok->loc, basety);
  wide->next = tok->next;
  return wide;
}
// Tokenizes the contents of `file` and returns the resulting token
// list, which always ends with a TK_EOF token.
//
// `file` becomes the current file for error reporting. Comments and
// whitespace are not tokens; they only influence the at_bol/has_space
// flags recorded in the next token. Numbers are emitted as pp-number
// tokens and keywords as plain identifiers; convert_pp_tokens() turns
// them into their final kinds later.
Token *tokenize(File *file) {
  current_file = file;
  char *p = file->contents;
  Token head = {};
  Token *cur = &head;
  at_bol = true;
  has_space = false;
  while (*p) {
    // Skip line comments.
    if (starts_with(p, "//")) {
      p += 2;
      // Stop at NUL as well: the buffer is not guaranteed to contain a
      // newline after the comment.
      while (*p && *p != '\n') p++;
      has_space = true;
      continue;
    }
    // Skip block comments.
    if (starts_with(p, "/*")) {
      char *q = strstr(p + 2, "*/");
      if (!q) error_at(p, "unclosed block comment");
      p = q + 2;
      has_space = true;
      continue;
    }
    // Skip newline.
    if (*p == '\n') {
      p++;
      at_bol = true;
      has_space = false;
      continue;
    }
    // Skip whitespace characters.
    if (isspace(*p)) {
      p++;
      has_space = true;
      continue;
    }
    // Numeric literal
    if (isdigit(*p) || (*p == '.' && isdigit(p[1]))) {
      char *q = p++;
      for (;;) {
        // An exponent marker directly followed by a sign belongs to
        // the number.
        if (p[0] && p[1] && strchr("eEpP", p[0]) && strchr("+-", p[1]))
          p += 2;
        else if (isalnum(*p) || *p == '.')
          p++;
        else
          break;
      }
      cur = cur->next = new_token(TK_PP_NUM, q, p);
      continue;
    }
    // String literal
    if (*p == '"') {
      cur = cur->next = read_string_literal(p, p);
      p += cur->len;
      continue;
    }
    // UTF-8 string literal
    if (starts_with(p, "u8\"")) {
      cur = cur->next = read_string_literal(p, p + 2);
      p += cur->len;
      continue;
    }
    // UTF-16 string literal
    if (starts_with(p, "u\"")) {
      cur = cur->next = read_utf16_string_literal(p, p + 1);
      p += cur->len;
      continue;
    }
    // Wide string literal
    if (starts_with(p, "L\"")) {
      cur = cur->next = read_utf32_string_literal(p, p + 1, ty_int);
      p += cur->len;
      continue;
    }
    // UTF-32 string literal
    if (starts_with(p, "U\"")) {
      cur = cur->next = read_utf32_string_literal(p, p + 1, ty_uint);
      p += cur->len;
      continue;
    }
    // Character literal
    if (*p == '\'') {
      cur = cur->next = read_char_literal(p, p, ty_int);
      // Narrow the value to the range of char.
      cur->val = (char)cur->val;
      p += cur->len;
      continue;
    }
    // UTF-16 character literal
    if (starts_with(p, "u'")) {
      cur = cur->next = read_char_literal(p, p + 1, ty_ushort);
      cur->val &= 0xffff;
      p += cur->len;
      continue;
    }
    // Wide character literal
    if (starts_with(p, "L'")) {
      cur = cur->next = read_char_literal(p, p + 1, ty_int);
      p += cur->len;
      continue;
    }
    // UTF-32 character literal
    if (starts_with(p, "U'")) {
      cur = cur->next = read_char_literal(p, p + 1, ty_uint);
      p += cur->len;
      continue;
    }
    // Identifier or keyword. This must come after the prefixed
    // literals above, whose prefixes are themselves valid identifiers.
    char *q;
    if ((q = read_ident(p)) != NULL) {
      cur = cur->next = new_token(TK_IDENT, p, q);
      p = q;
      continue;
    }
    // Three-letter punctuators
    if (starts_with(p, "<<=") || starts_with(p, ">>=") ||
        starts_with(p, "...")) {
      cur = cur->next = new_token(TK_RESERVED, p, p + 3);
      p += 3;
      continue;
    }
    // Two-letter punctuators
    if (starts_with(p, "==") || starts_with(p, "!=") || starts_with(p, "<=") ||
        starts_with(p, ">=") || starts_with(p, "->") || starts_with(p, "+=") ||
        starts_with(p, "-=") || starts_with(p, "*=") || starts_with(p, "/=") ||
        starts_with(p, "++") || starts_with(p, "--") || starts_with(p, "%=") ||
        starts_with(p, "&=") || starts_with(p, "|=") || starts_with(p, "^=") ||
        starts_with(p, "&&") || starts_with(p, "||") || starts_with(p, "<<") ||
        starts_with(p, ">>") || starts_with(p, "##")) {
      cur = cur->next = new_token(TK_RESERVED, p, p + 2);
      p += 2;
      continue;
    }
    // Single-letter punctuators
    if (ispunct(*p)) {
      cur = cur->next = new_token(TK_RESERVED, p, p + 1);
      p++;
      continue;
    }
    error_at(p, "invalid token");
  }
  cur = cur->next = new_token(TK_EOF, p, p);
  add_line_numbers(head.next);
  return head.next;
}
// Reads the whole file at `path` (or stdin when `path` is "-") into a
// freshly allocated, NUL-terminated buffer that always ends with a
// newline. Returns NULL if the file cannot be opened.
static char *read_file(char *path) {
  // By convention, "-" names standard input.
  FILE *fp = stdin;
  if (strcmp(path, "-") != 0) {
    fp = fopen(path, "r");
    if (fp == NULL) return NULL;
  }
  int buflen = 4096;
  int nread = 0;
  char *buf = calloc(1, buflen);
  // Slurp the stream, doubling the buffer whenever it fills up.
  for (;;) {
    // The last 2 bytes are reserved for the trailing "\n\0".
    int limit = buflen - 2;
    int got = fread(buf + nread, 1, limit - nread, fp);
    if (got == 0) break;
    nread += got;
    if (nread == limit) {
      buflen *= 2;
      buf = realloc(buf, buflen);
    }
  }
  if (fp != stdin) fclose(fp);
  // Terminate the last logical line with '\n': a trailing backslash is
  // replaced by one, and one is appended if the input lacks it.
  int ends_in_backslash = nread > 0 && buf[nread - 1] == '\\';
  if (ends_in_backslash) {
    buf[nread - 1] = '\n';
  } else if (nread == 0 || buf[nread - 1] != '\n') {
    buf[nread++] = '\n';
  }
  buf[nread] = '\0';
  return buf;
}
// Returns the NULL-terminated list of files registered by
// tokenize_file(), or NULL if none has been registered yet.
File **get_input_files(void) {
  File **files = input_files;
  return files;
}
File *new_file(char *name, int file_no, char *contents) {
File *file = calloc(1, sizeof(File));
file->name = name;
file->display_name = name;
file->file_no = file_no;
file->contents = contents;
return file;
}
// Rewrites `p` in place so that every "\r\n" pair and every lone "\r"
// becomes a single "\n".
static void canonicalize_newline(char *p) {
  char *src = p;
  char *dst = p;
  while (*src) {
    if (*src == '\r') {
      // Swallow the '\n' of a CRLF pair along with the '\r'.
      src += (src[1] == '\n') ? 2 : 1;
      *dst++ = '\n';
    } else {
      *dst++ = *src++;
    }
  }
  *dst = '\0';
}
// Removes backslashes followed by a newline, rewriting `p` in place.
//
// Each removed newline is emitted again right after the next real
// newline, so the number of newlines stays the same and logical line
// numbers keep matching physical ones. Newlines still owed when the
// input ends are emitted before the terminator, so a buffer that ended
// in "\\\n" keeps its trailing newline.
static void remove_backslash_newline(char *p) {
  int i = 0, j = 0;
  // Number of removed newlines that have not been re-emitted yet.
  int n = 0;
  while (p[i]) {
    if (p[i] == '\\' && p[i + 1] == '\n') {
      i += 2;
      n++;
    } else if (p[i] == '\n') {
      p[j++] = p[i++];
      for (; n > 0; n--) p[j++] = '\n';
    } else {
      p[j++] = p[i++];
    }
  }
  // Flush what is still owed. This cannot overrun the buffer: every
  // removal dropped two bytes and only one is written back.
  for (; n > 0; n--) p[j++] = '\n';
  p[j] = '\0';
}
// Parses exactly `len` hexadecimal digits starting at `p` and returns
// their value. Returns 0 if any of them is not a hex digit.
static uint32_t read_universal_char(char *p, int len) {
  uint32_t value = 0;
  for (char *stop = p + len; p < stop; p++) {
    if (!isxdigit(*p)) return 0;
    value = (value << 4) | from_hex(*p);
  }
  return value;
}
// Replace \u or \U escape sequences with corresponding UTF-8 bytes.
//
// The buffer is rewritten in place. A \u or \U that is not followed by
// the required 4 or 8 hex digits (or whose value is 0) is left as is.
// Any other backslash is copied together with the character after it,
// so that an escaped backslash does not make a following 'u' look like
// an escape.
static void convert_universal_chars(char *p) {
  char *q = p;
  while (*p) {
    if (starts_with(p, "\\u")) {
      uint32_t c = read_universal_char(p + 2, 4);
      if (c) {
        p += 6;
        q += encode_utf8(q, c);
      } else {
        *q++ = *p++;
      }
    } else if (starts_with(p, "\\U")) {
      uint32_t c = read_universal_char(p + 2, 8);
      if (c) {
        p += 10;
        q += encode_utf8(q, c);
      } else {
        *q++ = *p++;
      }
    } else if (p[0] == '\\' && p[1]) {
      // Only copy a pair when a second character exists. Copying the
      // terminator as the "escaped" character would move `p` past the
      // end of the buffer.
      *q++ = *p++;
      *q++ = *p++;
    } else {
      *q++ = *p++;
    }
  }
  *q = '\0';
}
// Reads the file at `path`, normalizes it (newlines, backslash-newline
// splices, universal character names), registers it in the list of
// input files and returns its tokens. Returns NULL if the file cannot
// be read.
Token *tokenize_file(char *path) {
  char *p = read_file(path);
  if (!p) return NULL;
  canonicalize_newline(p);
  remove_backslash_newline(p);
  convert_universal_chars(p);
  // Files are numbered from 1 in the order they are read.
  static int file_no;
  File *file = new_file(path, file_no + 1, p);
  // Save the filename for assembler .file directive. The list is kept
  // NULL-terminated, hence room for file_no + 2 entries.
  input_files = realloc(input_files, sizeof(*input_files) * (file_no + 2));
  input_files[file_no] = file;
  input_files[file_no + 1] = NULL;
  file_no++;
  return tokenize(file);
}

286
third_party/chibicc/type.c vendored Normal file
View file

@ -0,0 +1,286 @@
#include "third_party/chibicc/chibicc.h"
// Shared Type objects for the built-in types. Each is a file-scope
// compound literal initialized positionally with a type kind followed
// by two numbers and, for the unsigned variants, `true`.
// NOTE(review): the numbers are presumably size and alignment in bytes
// and the fourth field is_unsigned, matching new_type() and
// pointer_to() -- confirm against the Type declaration.
Type *ty_void = &(Type){TY_VOID, 1, 1};
Type *ty_bool = &(Type){TY_BOOL, 1, 1};
Type *ty_char = &(Type){TY_CHAR, 1, 1};
Type *ty_short = &(Type){TY_SHORT, 2, 2};
Type *ty_int = &(Type){TY_INT, 4, 4};
Type *ty_long = &(Type){TY_LONG, 8, 8};
// Unsigned integer types share the kind of their signed counterpart.
Type *ty_uchar = &(Type){TY_CHAR, 1, 1, true};
Type *ty_ushort = &(Type){TY_SHORT, 2, 2, true};
Type *ty_uint = &(Type){TY_INT, 4, 4, true};
Type *ty_ulong = &(Type){TY_LONG, 8, 8, true};
// Floating-point types.
Type *ty_float = &(Type){TY_FLOAT, 4, 4};
Type *ty_double = &(Type){TY_DOUBLE, 8, 8};
Type *ty_ldouble = &(Type){TY_LDOUBLE, 16, 16};
// Allocates a zero-initialized Type with the given kind, size and
// alignment.
static Type *new_type(TypeKind kind, int size, int align) {
  Type *t = calloc(1, sizeof(Type));
  t->align = align;
  t->size = size;
  t->kind = kind;
  return t;
}
// Returns true for _Bool, char, short, int, long and enum types.
bool is_integer(Type *ty) {
  switch (ty->kind) {
    case TY_BOOL:
    case TY_CHAR:
    case TY_SHORT:
    case TY_INT:
    case TY_LONG:
    case TY_ENUM:
      return true;
    default:
      return false;
  }
}
// Returns true for float, double and long double types.
bool is_flonum(Type *ty) {
  switch (ty->kind) {
    case TY_FLOAT:
    case TY_DOUBLE:
    case TY_LDOUBLE:
      return true;
    default:
      return false;
  }
}
// Returns true if `ty` is an integer or floating-point type.
bool is_numeric(Type *ty) {
  if (is_integer(ty)) return true;
  return is_flonum(ty);
}
// Returns true if `t1` and `t2` are compatible types.
//
// Copies made by copy_type() are compared through their `origin`.
// Integer types must agree in signedness, pointers in their pointee,
// functions in return type, variadic-ness and parameter list. Kinds
// not listed in the switch are only compatible when they are the very
// same Type object.
bool is_compatible(Type *t1, Type *t2) {
  if (t1 == t2) return true;
  if (t1->origin) return is_compatible(t1->origin, t2);
  if (t2->origin) return is_compatible(t1, t2->origin);
  if (t1->kind != t2->kind) return false;
  switch (t1->kind) {
    case TY_CHAR:
    case TY_SHORT:
    case TY_INT:
    case TY_LONG:
      return t1->is_unsigned == t2->is_unsigned;
    case TY_FLOAT:
    case TY_DOUBLE:
    case TY_LDOUBLE:
      return true;
    case TY_PTR:
      return is_compatible(t1->base, t2->base);
    case TY_FUNC: {
      if (!is_compatible(t1->return_ty, t2->return_ty)) return false;
      if (t1->is_variadic != t2->is_variadic) return false;
      Type *p1 = t1->params;
      Type *p2 = t2->params;
      for (; p1 && p2; p1 = p1->next, p2 = p2->next)
        if (!is_compatible(p1, p2)) return false;
      // Both lists must end together.
      return p1 == NULL && p2 == NULL;
    }
    case TY_ARRAY:
      if (!is_compatible(t1->base, t2->base)) return false;
      // Arrays with compatible elements are compatible unless both
      // have a size and the sizes differ. Requiring both lengths to be
      // negative AND equal made e.g. two separate int[3] types
      // incompatible.
      // NOTE(review): assumes a negative array_len marks an array of
      // unspecified size -- confirm against the parser.
      return t1->array_len < 0 || t2->array_len < 0 ||
             t1->array_len == t2->array_len;
    default:
      break;
  }
  return false;
}
// Returns a newly allocated shallow copy of `ty` whose `origin` points
// back at `ty`.
Type *copy_type(Type *ty) {
  Type *dup = calloc(1, sizeof(Type));
  *dup = *ty;
  dup->origin = ty;
  return dup;
}
// Creates a pointer type to `base`: 8 bytes, 8-byte aligned and
// flagged as unsigned.
Type *pointer_to(Type *base) {
  Type *ptr = new_type(TY_PTR, 8, 8);
  ptr->is_unsigned = true;
  ptr->base = base;
  return ptr;
}
// Creates a function type returning `return_ty`.
Type *func_type(Type *return_ty) {
  // sizeof(<function type>) is not allowed by the C spec, but GCC
  // accepts it and yields 1, hence size 1 here.
  Type *fn = new_type(TY_FUNC, 1, 1);
  fn->return_ty = return_ty;
  return fn;
}
// Creates the type "array of `len` elements of `base`". The size is
// the element size times `len`; the alignment is that of the element.
Type *array_of(Type *base, int len) {
  Type *arr = new_type(TY_ARRAY, base->size * len, base->align);
  arr->array_len = len;
  arr->base = base;
  return arr;
}
// Creates a variable-length array type of `base` whose length is the
// expression `len`. The type itself is given size and alignment 8.
Type *vla_of(Type *base, Node *len) {
  Type *vla = new_type(TY_VLA, 8, 8);
  vla->vla_len = len;
  vla->base = base;
  return vla;
}
// Creates a new enum type: 4 bytes, 4-byte aligned.
Type *enum_type(void) {
  Type *en = new_type(TY_ENUM, 4, 4);
  return en;
}
// Creates a new struct type with size 0 and alignment 1.
Type *struct_type(void) {
  Type *st = new_type(TY_STRUCT, 0, 1);
  return st;
}
// Picks the type both operands of a binary operator are converted to.
//
// The rules are tried in order: an operand with a base type yields a
// pointer to that base; a function yields a pointer to it; then the
// widest floating type present wins; otherwise integers narrower than
// 4 bytes become int, the larger size wins, and on equal size an
// unsigned second operand wins over the first.
static Type *get_common_type(Type *ty1, Type *ty2) {
  if (ty1->base) return pointer_to(ty1->base);
  if (ty1->kind == TY_FUNC) return pointer_to(ty1);
  if (ty2->kind == TY_FUNC) return pointer_to(ty2);
  TypeKind k1 = ty1->kind;
  TypeKind k2 = ty2->kind;
  if (k1 == TY_LDOUBLE || k2 == TY_LDOUBLE) return ty_ldouble;
  if (k1 == TY_DOUBLE || k2 == TY_DOUBLE) return ty_double;
  if (k1 == TY_FLOAT || k2 == TY_FLOAT) return ty_float;
  Type *a = (ty1->size < 4) ? ty_int : ty1;
  Type *b = (ty2->size < 4) ? ty_int : ty2;
  if (a->size < b->size) return b;
  if (a->size > b->size) return a;
  return b->is_unsigned ? b : a;
}
// Many binary operators implicitly convert their operands to a single
// common type: anything narrower than int becomes int, and when one
// operand is larger than the other (e.g. "long" vs. "int") the smaller
// one is converted to match.
//
// This is the "usual arithmetic conversion". Both operands are
// replaced by casts to the common type.
static void usual_arith_conv(Node **lhs, Node **rhs) {
  Type *common = get_common_type((*lhs)->ty, (*rhs)->ty);
  *lhs = new_cast(*lhs, common);
  *rhs = new_cast(*rhs, common);
}
// Computes and stores the type of `node` and of every node below it.
//
// Nodes that are NULL or already typed are left alone. Children are
// typed first, then the node's own type is derived from its kind.
// Kinds not listed in the switch keep a NULL type. Semantic errors
// found on the way are reported through error_tok().
void add_type(Node *node) {
  if (!node || node->ty) return;
  add_type(node->lhs);
  add_type(node->rhs);
  add_type(node->cond);
  add_type(node->then);
  add_type(node->els);
  add_type(node->init);
  add_type(node->inc);
  for (Node *n = node->body; n; n = n->next) add_type(n);
  for (Node *n = node->args; n; n = n->next) add_type(n);
  switch (node->kind) {
    case ND_NUM:
      node->ty = ty_int;
      return;
    case ND_ADD:
    case ND_SUB:
    case ND_MUL:
    case ND_DIV:
    case ND_MOD:
    case ND_BITAND:
    case ND_BITOR:
    case ND_BITXOR:
      usual_arith_conv(&node->lhs, &node->rhs);
      node->ty = node->lhs->ty;
      return;
    case ND_NEG: {
      // Negation promotes its operand as if it were paired with int.
      Type *ty = get_common_type(ty_int, node->lhs->ty);
      node->lhs = new_cast(node->lhs, ty);
      node->ty = ty;
      return;
    }
    case ND_ASSIGN:
      if (node->lhs->ty->kind == TY_ARRAY)
        error_tok(node->lhs->tok, "not an lvalue");
      // Struct assignment copies the object; no cast is inserted.
      if (node->lhs->ty->kind != TY_STRUCT)
        node->rhs = new_cast(node->rhs, node->lhs->ty);
      node->ty = node->lhs->ty;
      return;
    case ND_EQ:
    case ND_NE:
    case ND_LT:
    case ND_LE:
      usual_arith_conv(&node->lhs, &node->rhs);
      node->ty = ty_int;
      return;
    case ND_FUNCALL:
      node->ty = node->func_ty->return_ty;
      return;
    case ND_NOT:
    case ND_LOGOR:
    case ND_LOGAND:
      node->ty = ty_int;
      return;
    case ND_BITNOT:
    case ND_SHL:
    case ND_SHR:
      node->ty = node->lhs->ty;
      return;
    case ND_VAR:
    case ND_VLA_PTR:
      node->ty = node->var->ty;
      return;
    case ND_COND:
      if (node->then->ty->kind == TY_VOID || node->els->ty->kind == TY_VOID) {
        node->ty = ty_void;
      } else {
        usual_arith_conv(&node->then, &node->els);
        node->ty = node->then->ty;
      }
      return;
    case ND_COMMA:
      node->ty = node->rhs->ty;
      return;
    case ND_MEMBER:
      node->ty = node->member->ty;
      return;
    case ND_ADDR: {
      // The address of an array is typed as a pointer to its element.
      Type *ty = node->lhs->ty;
      if (ty->kind == TY_ARRAY)
        node->ty = pointer_to(ty->base);
      else
        node->ty = pointer_to(ty);
      return;
    }
    case ND_DEREF:
      if (!node->lhs->ty->base)
        error_tok(node->tok, "invalid pointer dereference");
      if (node->lhs->ty->base->kind == TY_VOID)
        error_tok(node->tok, "dereferencing a void pointer");
      node->ty = node->lhs->ty->base;
      return;
    case ND_STMT_EXPR:
      // The value of a statement expression is that of its last
      // statement, which must be an expression statement.
      if (node->body) {
        Node *stmt = node->body;
        while (stmt->next) stmt = stmt->next;
        if (stmt->kind == ND_EXPR_STMT) {
          node->ty = stmt->lhs->ty;
          return;
        }
      }
      error_tok(node->tok,
                "statement expression returning void is not supported");
      return;
    case ND_LABEL_VAL:
      node->ty = pointer_to(ty_void);
      return;
    case ND_CAS:
      add_type(node->cas_addr);
      add_type(node->cas_old);
      add_type(node->cas_new);
      node->ty = ty_bool;
      if (node->cas_addr->ty->kind != TY_PTR)
        error_tok(node->cas_addr->tok, "pointer expected");
      if (node->cas_old->ty->kind != TY_PTR)
        error_tok(node->cas_old->tok, "pointer expected");
      return;
    case ND_EXCH:
      // The operand being checked is `lhs`, so the error is reported
      // at its token; `cas_addr` is only typed for ND_CAS above.
      if (node->lhs->ty->kind != TY_PTR)
        error_tok(node->lhs->tok, "pointer expected");
      node->ty = node->lhs->ty->base;
      return;
  }
}

186
third_party/chibicc/unicode.c vendored Normal file
View file

@ -0,0 +1,186 @@
#include "third_party/chibicc/chibicc.h"
// Writes the UTF-8 encoding of code point `c` to `buf` and returns the
// number of bytes written (1 to 4). No terminator is added.
int encode_utf8(char *buf, uint32_t c) {
  if (c < 0x80) {
    buf[0] = c;
    return 1;
  }
  if (c < 0x800) {
    buf[0] = 0xC0 | (c >> 6);
    buf[1] = 0x80 | (c & 0x3F);
    return 2;
  }
  if (c < 0x10000) {
    buf[0] = 0xE0 | (c >> 12);
    buf[1] = 0x80 | ((c >> 6) & 0x3F);
    buf[2] = 0x80 | (c & 0x3F);
    return 3;
  }
  buf[0] = 0xF0 | (c >> 18);
  buf[1] = 0x80 | ((c >> 12) & 0x3F);
  buf[2] = 0x80 | ((c >> 6) & 0x3F);
  buf[3] = 0x80 | (c & 0x3F);
  return 4;
}
// Decodes one UTF-8-encoded code point starting at `p`, stores the
// position following it in `*new_pos` and returns the code point.
// Source files are assumed to be UTF-8.
//
// A code point takes one to four bytes in UTF-8. Single bytes below
// 128 are plain ASCII; everything else starts with a lead byte that
// announces the length and continues with bytes of the form 10xxxxxx.
// A malformed sequence is reported through error_at().
uint32_t decode_utf8(char **new_pos, char *p) {
  unsigned char lead = (unsigned char)*p;
  if (lead < 128) {
    *new_pos = p + 1;
    return *p;
  }
  char *start = p;
  int len;
  uint32_t c;
  // The lead byte determines the length and carries the top bits.
  if (lead >= 0xF0) {
    len = 4;
    c = *p & 0x07;
  } else if (lead >= 0xE0) {
    len = 3;
    c = *p & 0x0F;
  } else if (lead >= 0xC0) {
    len = 2;
    c = *p & 0x1F;
  } else {
    error_at(start, "invalid UTF-8 sequence");
  }
  // Each continuation byte contributes six more bits.
  for (int i = 1; i < len; i++) {
    unsigned char cont = (unsigned char)p[i];
    if ((cont >> 6) != 2) error_at(start, "invalid UTF-8 sequence");
    c = (c << 6) | (p[i] & 0x3F);
  }
  *new_pos = p + len;
  return c;
}
// Returns true if `c` lies in one of the inclusive [low, high] pairs
// stored in `range`. The list is terminated by a single -1 entry.
static bool in_range(uint32_t *range, uint32_t c) {
  for (uint32_t *pair = range; pair[0] != -1; pair += 2) {
    if (c < pair[0]) continue;
    if (c <= pair[1]) return true;
  }
  return false;
}
// C11 allows not only ASCII but some multibyte characters in certain
// Unicode ranges to be used in an identifier. See C11 Annex D for the
// details.
//
// This function returns true if a given character is acceptable as
// the first character of an identifier. Besides the Unicode ranges,
// the table accepts '_', ASCII letters and '$'.
//
// For example, ¾ (U+00BE) is a valid identifier because characters in
// 0x00BC-0x00BE are allowed, while neither ⟘ (U+27D8) nor U+3000
// (full-width space) are allowed because they are out of range.
bool is_ident1(uint32_t c) {
// Inclusive [low, high] pairs, terminated by -1 for in_range().
static uint32_t range[] = {
'_', '_', 'a', 'z', 'A', 'Z', '$', '$',
0x00A8, 0x00A8, 0x00AA, 0x00AA, 0x00AD, 0x00AD, 0x00AF, 0x00AF,
0x00B2, 0x00B5, 0x00B7, 0x00BA, 0x00BC, 0x00BE, 0x00C0, 0x00D6,
0x00D8, 0x00F6, 0x00F8, 0x00FF, 0x0100, 0x02FF, 0x0370, 0x167F,
0x1681, 0x180D, 0x180F, 0x1DBF, 0x1E00, 0x1FFF, 0x200B, 0x200D,
0x202A, 0x202E, 0x203F, 0x2040, 0x2054, 0x2054, 0x2060, 0x206F,
0x2070, 0x20CF, 0x2100, 0x218F, 0x2460, 0x24FF, 0x2776, 0x2793,
0x2C00, 0x2DFF, 0x2E80, 0x2FFF, 0x3004, 0x3007, 0x3021, 0x302F,
0x3031, 0x303F, 0x3040, 0xD7FF, 0xF900, 0xFD3D, 0xFD40, 0xFDCF,
0xFDF0, 0xFE1F, 0xFE30, 0xFE44, 0xFE47, 0xFFFD, 0x10000, 0x1FFFD,
0x20000, 0x2FFFD, 0x30000, 0x3FFFD, 0x40000, 0x4FFFD, 0x50000, 0x5FFFD,
0x60000, 0x6FFFD, 0x70000, 0x7FFFD, 0x80000, 0x8FFFD, 0x90000, 0x9FFFD,
0xA0000, 0xAFFFD, 0xB0000, 0xBFFFD, 0xC0000, 0xCFFFD, 0xD0000, 0xDFFFD,
0xE0000, 0xEFFFD, -1,
};
return in_range(range, c);
}
// Returns true if `c` may appear in an identifier after its first
// character: anything is_ident1() accepts, plus decimal digits and the
// additional ranges listed below.
bool is_ident2(uint32_t c) {
  static uint32_t range[] = {
      '0',    '9',    '$',    '$',    0x0300, 0x036F, 0x1DC0,
      0x1DFF, 0x20D0, 0x20FF, 0xFE20, 0xFE2F, -1,
  };
  if (is_ident1(c)) return true;
  return in_range(range, c);
}
// Returns the number of columns needed to display a given
// character in a fixed-width font.
//
// The result is 0 for code points listed in `range1`, 2 for those
// listed in `range2`, and 1 for everything else. `range1` is checked
// first, so it wins where the two tables overlap.
//
// Based on https://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
static int char_width(uint32_t c) {
// Zero-width code points, as inclusive [low, high] pairs ended by -1.
static uint32_t range1[] = {
0x0000, 0x001F, 0x007f, 0x00a0, 0x0300, 0x036F, 0x0483, 0x0486,
0x0488, 0x0489, 0x0591, 0x05BD, 0x05BF, 0x05BF, 0x05C1, 0x05C2,
0x05C4, 0x05C5, 0x05C7, 0x05C7, 0x0600, 0x0603, 0x0610, 0x0615,
0x064B, 0x065E, 0x0670, 0x0670, 0x06D6, 0x06E4, 0x06E7, 0x06E8,
0x06EA, 0x06ED, 0x070F, 0x070F, 0x0711, 0x0711, 0x0730, 0x074A,
0x07A6, 0x07B0, 0x07EB, 0x07F3, 0x0901, 0x0902, 0x093C, 0x093C,
0x0941, 0x0948, 0x094D, 0x094D, 0x0951, 0x0954, 0x0962, 0x0963,
0x0981, 0x0981, 0x09BC, 0x09BC, 0x09C1, 0x09C4, 0x09CD, 0x09CD,
0x09E2, 0x09E3, 0x0A01, 0x0A02, 0x0A3C, 0x0A3C, 0x0A41, 0x0A42,
0x0A47, 0x0A48, 0x0A4B, 0x0A4D, 0x0A70, 0x0A71, 0x0A81, 0x0A82,
0x0ABC, 0x0ABC, 0x0AC1, 0x0AC5, 0x0AC7, 0x0AC8, 0x0ACD, 0x0ACD,
0x0AE2, 0x0AE3, 0x0B01, 0x0B01, 0x0B3C, 0x0B3C, 0x0B3F, 0x0B3F,
0x0B41, 0x0B43, 0x0B4D, 0x0B4D, 0x0B56, 0x0B56, 0x0B82, 0x0B82,
0x0BC0, 0x0BC0, 0x0BCD, 0x0BCD, 0x0C3E, 0x0C40, 0x0C46, 0x0C48,
0x0C4A, 0x0C4D, 0x0C55, 0x0C56, 0x0CBC, 0x0CBC, 0x0CBF, 0x0CBF,
0x0CC6, 0x0CC6, 0x0CCC, 0x0CCD, 0x0CE2, 0x0CE3, 0x0D41, 0x0D43,
0x0D4D, 0x0D4D, 0x0DCA, 0x0DCA, 0x0DD2, 0x0DD4, 0x0DD6, 0x0DD6,
0x0E31, 0x0E31, 0x0E34, 0x0E3A, 0x0E47, 0x0E4E, 0x0EB1, 0x0EB1,
0x0EB4, 0x0EB9, 0x0EBB, 0x0EBC, 0x0EC8, 0x0ECD, 0x0F18, 0x0F19,
0x0F35, 0x0F35, 0x0F37, 0x0F37, 0x0F39, 0x0F39, 0x0F71, 0x0F7E,
0x0F80, 0x0F84, 0x0F86, 0x0F87, 0x0F90, 0x0F97, 0x0F99, 0x0FBC,
0x0FC6, 0x0FC6, 0x102D, 0x1030, 0x1032, 0x1032, 0x1036, 0x1037,
0x1039, 0x1039, 0x1058, 0x1059, 0x1160, 0x11FF, 0x135F, 0x135F,
0x1712, 0x1714, 0x1732, 0x1734, 0x1752, 0x1753, 0x1772, 0x1773,
0x17B4, 0x17B5, 0x17B7, 0x17BD, 0x17C6, 0x17C6, 0x17C9, 0x17D3,
0x17DD, 0x17DD, 0x180B, 0x180D, 0x18A9, 0x18A9, 0x1920, 0x1922,
0x1927, 0x1928, 0x1932, 0x1932, 0x1939, 0x193B, 0x1A17, 0x1A18,
0x1B00, 0x1B03, 0x1B34, 0x1B34, 0x1B36, 0x1B3A, 0x1B3C, 0x1B3C,
0x1B42, 0x1B42, 0x1B6B, 0x1B73, 0x1DC0, 0x1DCA, 0x1DFE, 0x1DFF,
0x200B, 0x200F, 0x202A, 0x202E, 0x2060, 0x2063, 0x206A, 0x206F,
0x20D0, 0x20EF, 0x302A, 0x302F, 0x3099, 0x309A, 0xA806, 0xA806,
0xA80B, 0xA80B, 0xA825, 0xA826, 0xFB1E, 0xFB1E, 0xFE00, 0xFE0F,
0xFE20, 0xFE23, 0xFEFF, 0xFEFF, 0xFFF9, 0xFFFB, 0x10A01, 0x10A03,
0x10A05, 0x10A06, 0x10A0C, 0x10A0F, 0x10A38, 0x10A3A, 0x10A3F, 0x10A3F,
0x1D167, 0x1D169, 0x1D173, 0x1D182, 0x1D185, 0x1D18B, 0x1D1AA, 0x1D1AD,
0x1D242, 0x1D244, 0xE0001, 0xE0001, 0xE0020, 0xE007F, 0xE0100, 0xE01EF,
-1,
};
if (in_range(range1, c)) return 0;
// Double-width code points, in the same pair format.
static uint32_t range2[] = {
0x1100, 0x115F, 0x2329, 0x2329, 0x232A, 0x232A, 0x2E80, 0x303E,
0x3040, 0xA4CF, 0xAC00, 0xD7A3, 0xF900, 0xFAFF, 0xFE10, 0xFE19,
0xFE30, 0xFE6F, 0xFF00, 0xFF60, 0xFFE0, 0xFFE6, 0x1F000, 0x1F644,
0x20000, 0x2FFFD, 0x30000, 0x3FFFD, -1,
};
if (in_range(range2, c)) return 2;
return 1;
}
// Returns the number of columns needed to display the first `len`
// bytes of the UTF-8 string `p` in a fixed-width font, summing the
// width of each decoded code point.
int str_width(char *p, int len) {
  char *limit = p + len;
  int columns = 0;
  while (p < limit) {
    columns += char_width(decode_utf8(&p, p));
  }
  return columns;
}