Share file offset across processes

This change ensures that if a file descriptor for an open disk file gets
shared by multiple processes within a process tree, then lseek() changes
will be visible across processes, and read() / write() are synchronized.
Note this only applies to Windows, because UNIX kernels already do this.
This commit is contained in:
Justine Tunney 2024-08-03 01:24:46 -07:00
parent a80ab3f8fe
commit 761c6ad615
No known key found for this signature in database
GPG key ID: BE714B4575D6E328
15 changed files with 256 additions and 63 deletions

View file

@ -17,13 +17,14 @@
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/calls/internal.h"
#include "libc/intrin/fds.h"
#include "libc/calls/syscall-nt.internal.h"
#include "libc/calls/syscall_support-nt.internal.h"
#include "libc/intrin/fds.h"
#include "libc/intrin/weaken.h"
#include "libc/nt/enum/filetype.h"
#include "libc/nt/files.h"
#include "libc/nt/runtime.h"
#include "libc/runtime/runtime.h"
#include "libc/runtime/zipos.internal.h"
#include "libc/sock/syscall_fd.internal.h"
#include "libc/sysv/consts/o.h"
@ -64,5 +65,7 @@ textwindows int sys_close_nt(int fd, int fildes) {
default:
break;
}
if (f->shared && !f->isdup)
munmap(f->shared, sizeof(struct Cursor));
return CloseHandle(f->handle) ? 0 : __winerr();
}

View file

@ -82,6 +82,8 @@ static textwindows int sys_dup_nt_impl(int oldfd, int newfd, int flags,
g_fds.p[newfd] = g_fds.p[oldfd];
g_fds.p[newfd].handle = handle;
g_fds.p[newfd].isdup = true;
g_fds.p[oldfd].isdup = true; // TODO(jart): is it possible to avoid leak?
if (flags & _O_CLOEXEC) {
g_fds.p[newfd].flags |= _O_CLOEXEC;
} else {

View file

@ -20,13 +20,13 @@
#include "libc/calls/calls.h"
#include "libc/calls/createfileflags.internal.h"
#include "libc/calls/internal.h"
#include "libc/intrin/fds.h"
#include "libc/calls/struct/flock.h"
#include "libc/calls/struct/sigset.internal.h"
#include "libc/calls/syscall-nt.internal.h"
#include "libc/calls/syscall_support-nt.internal.h"
#include "libc/calls/wincrash.internal.h"
#include "libc/errno.h"
#include "libc/intrin/fds.h"
#include "libc/intrin/kprintf.h"
#include "libc/intrin/weaken.h"
#include "libc/limits.h"
@ -151,7 +151,7 @@ static textwindows int sys_fcntl_nt_lock(struct Fd *f, int fd, int cmd,
case SEEK_SET:
break;
case SEEK_CUR:
off = f->pointer + off;
off = f->shared->pointer + off;
break;
case SEEK_END: {
int64_t size;
@ -351,9 +351,14 @@ textwindows int sys_fcntl_nt(int fd, int cmd, uintptr_t arg) {
}
rc = 0;
} else if (cmd == F_SETLK || cmd == F_SETLKW || cmd == F_GETLK) {
struct Fd *f = g_fds.p + fd;
if (f->shared) {
pthread_mutex_lock(&g_locks.mu);
rc = sys_fcntl_nt_lock(g_fds.p + fd, fd, cmd, arg);
rc = sys_fcntl_nt_lock(f, fd, cmd, arg);
pthread_mutex_unlock(&g_locks.mu);
} else {
rc = ebadf();
}
} else if (cmd == F_DUPFD || cmd == F_DUPFD_CLOEXEC) {
rc = sys_fcntl_nt_dupfd(fd, cmd, arg);
} else {

View file

@ -31,7 +31,7 @@ static textwindows int64_t GetPosition(struct Fd *f, int whence) {
case SEEK_SET:
return 0;
case SEEK_CUR:
return f->pointer;
return f->shared->pointer;
case SEEK_END: {
struct NtByHandleFileInformation wst;
if (!GetFileInformationByHandle(f->handle, &wst)) {
@ -67,11 +67,14 @@ textwindows int64_t sys_lseek_nt(int fd, int64_t offset, int whence) {
} else if (__isfdkind(fd, kFdFile)) {
struct Fd *f = g_fds.p + fd;
int filetype = GetFileType(f->handle);
if (filetype != kNtFileTypePipe && filetype != kNtFileTypeChar) {
if (filetype != kNtFileTypePipe && //
filetype != kNtFileTypeChar && //
f->shared) {
int64_t res;
if ((res = Seek(f, offset, whence)) != -1) {
f->pointer = res;
}
__fd_lock(f);
if ((res = Seek(f, offset, whence)) != -1)
f->shared->pointer = res;
__fd_unlock(f);
return res;
} else {
return espipe();

View file

@ -138,6 +138,7 @@ static textwindows int sys_open_nt_file(int dirfd, const char *file,
int64_t handle;
if ((handle = sys_open_nt_impl(dirfd, file, flags, mode,
kNtFileFlagOverlapped)) != -1) {
g_fds.p[fd].shared = __cursor_new();
g_fds.p[fd].handle = handle;
g_fds.p[fd].kind = kFdFile;
g_fds.p[fd].flags = flags;
@ -170,14 +171,14 @@ static textwindows int sys_open_nt_no_handle(int fd, int flags, int mode,
static textwindows int sys_open_nt_dup(int fd, int flags, int mode, int oldfd) {
int64_t handle;
if (!__isfdopen(oldfd)) {
if (!__isfdopen(oldfd))
return enoent();
}
if (DuplicateHandle(GetCurrentProcess(), g_fds.p[oldfd].handle,
GetCurrentProcess(), &handle, 0, true,
kNtDuplicateSameAccess)) {
g_fds.p[fd] = g_fds.p[oldfd];
g_fds.p[fd].handle = handle;
g_fds.p[fd].isdup = true;
g_fds.p[fd].mode = mode;
if (!sys_fcntl_nt_setfl(fd, flags)) {
return fd;

View file

@ -19,9 +19,9 @@
#include "libc/calls/createfileflags.internal.h"
#include "libc/calls/internal.h"
#include "libc/calls/sig.internal.h"
#include "libc/intrin/fds.h"
#include "libc/calls/struct/sigset.h"
#include "libc/calls/syscall_support-nt.internal.h"
#include "libc/intrin/fds.h"
#include "libc/intrin/weaken.h"
#include "libc/nt/enum/filetype.h"
#include "libc/nt/errors.h"
@ -51,20 +51,19 @@ sys_readwrite_nt(int fd, void *data, size_t size, ssize_t offset,
uint32_t exchanged;
struct Fd *f = g_fds.p + fd;
// win32 i/o apis generally take 32-bit values thus we implicitly
// truncate outrageously large sizes. linux actually does it too!
size = MIN(size, 0x7ffff000);
// pread() and pwrite() perform an implicit lseek() operation, so
// similar to the lseek() system call, they too raise ESPIPE when
// operating on a non-seekable file.
bool pwriting = offset != -1;
bool seekable =
(f->kind == kFdFile && GetFileType(handle) == kNtFileTypeDisk) ||
f->kind == kFdDevNull || f->kind == kFdDevRandom;
if (pwriting && !seekable) {
bool isdisk = f->kind == kFdFile && GetFileType(handle) == kNtFileTypeDisk;
bool seekable = isdisk || f->kind == kFdDevNull || f->kind == kFdDevRandom;
if (pwriting && !seekable)
return espipe();
}
// determine if we need to lock a file descriptor across processes
bool locked = isdisk && !pwriting && f->shared;
if (locked)
__fd_lock(f);
// when a file is opened in overlapped mode win32 requires that we
// take over full responsibility for managing our own file pointer
@ -72,8 +71,8 @@ sys_readwrite_nt(int fd, void *data, size_t size, ssize_t offset,
// the sense that it behaves so differently from linux, that using
// win32 i/o required more compatibilty toil than doing it by hand
if (!pwriting) {
if (seekable) {
offset = f->pointer;
if (seekable && f->shared) {
offset = f->shared->pointer;
} else {
offset = 0;
}
@ -82,8 +81,11 @@ sys_readwrite_nt(int fd, void *data, size_t size, ssize_t offset,
RestartOperation:
bool eagained = false;
// check for signals and cancelation
if (_check_cancel() == -1)
if (_check_cancel() == -1) {
if (locked)
__fd_unlock(f);
return -1; // ECANCELED
}
if (_weaken(__sig_get) && (sig = _weaken(__sig_get)(waitmask))) {
goto HandleInterrupt;
}
@ -114,16 +116,16 @@ RestartOperation:
}
ok = true;
}
if (ok) {
if (ok)
ok = GetOverlappedResult(handle, &overlap, &exchanged, true);
}
CloseHandle(overlap.hEvent);
// if i/o succeeded then return its result
if (ok) {
if (!pwriting && seekable) {
f->pointer = offset + exchanged;
}
if (!pwriting && seekable && f->shared)
f->shared->pointer = offset + exchanged;
if (locked)
__fd_unlock(f);
return exchanged;
}
@ -131,23 +133,32 @@ RestartOperation:
if (GetLastError() == kNtErrorOperationAborted) {
// raise EAGAIN if it's due to O_NONBLOCK mmode
if (eagained) {
if (locked)
__fd_unlock(f);
return eagain();
}
// otherwise it must be due to a kill() via __sig_cancel()
if (_weaken(__sig_relay) && (sig = _weaken(__sig_get)(waitmask))) {
HandleInterrupt:
if (locked)
__fd_unlock(f);
int handler_was_called = _weaken(__sig_relay)(sig, SI_KERNEL, waitmask);
if (_check_cancel() == -1)
return -1; // possible if we SIGTHR'd
if (locked)
__fd_lock(f);
// read() is @restartable unless non-SA_RESTART hands were called
if (!(handler_was_called & SIG_HANDLED_NO_RESTART)) {
if (!(handler_was_called & SIG_HANDLED_NO_RESTART))
goto RestartOperation;
}
}
if (locked)
__fd_unlock(f);
return eintr();
}
// read() and write() have generally different error-handling paths
if (locked)
__fd_unlock(f);
return -2;
}

View file

@ -16,13 +16,13 @@
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/intrin/fds.h"
#include "libc/calls/internal.h"
#include "libc/calls/state.internal.h"
#include "libc/calls/ttydefaults.h"
#include "libc/dce.h"
#include "libc/intrin/atomic.h"
#include "libc/intrin/extend.h"
#include "libc/intrin/fds.h"
#include "libc/intrin/kprintf.h"
#include "libc/intrin/nomultics.h"
#include "libc/intrin/pushpop.h"
@ -40,6 +40,7 @@
#include "libc/sock/sock.h"
#include "libc/sysv/consts/map.h"
#include "libc/sysv/consts/o.h"
#include "libc/sysv/consts/prot.h"
#include "libc/thread/thread.h"
#define OPEN_MAX 16
@ -156,12 +157,29 @@ textstartup void __init_fds(int argc, char **argv, char **envp) {
f->kind = kind;
f->flags = flags;
f->mode = mode;
f->pointer = pointer;
f->type = type;
f->family = family;
f->protocol = protocol;
atomic_store_explicit(&fds->f, fd + 1, memory_order_relaxed);
//
// - v1 abi: This field was originally the file pointer.
//
// - v2 abi: This field is the negated shared memory address.
//
if (f->kind == kFdFile) {
if (pointer < 0) {
f->shared = (struct Cursor *)(uintptr_t)-pointer;
} else if ((f->shared = __cursor_new())) {
f->shared->pointer = pointer;
}
}
}
}
for (int i = 0; i < 3; ++i) {
struct Fd *f = fds->p + i;
if (f->kind == kFdFile && !f->shared)
f->shared = __cursor_new();
}
}
}

View file

@ -1,5 +1,7 @@
#ifndef COSMOPOLITAN_LIBC_CALLS_STRUCT_FD_INTERNAL_H_
#define COSMOPOLITAN_LIBC_CALLS_STRUCT_FD_INTERNAL_H_
#include "libc/atomic.h"
#include "libc/thread/thread.h"
COSMOPOLITAN_C_START_
#define kFdEmpty 0
@ -13,19 +15,25 @@ COSMOPOLITAN_C_START_
#define kFdDevNull 9
#define kFdDevRandom 10
struct Cursor {
pthread_mutex_t lock;
long pointer;
};
struct Fd {
char kind;
bool isdup;
bool isbound;
unsigned flags;
unsigned mode;
long handle;
long pointer;
int family;
int type;
int protocol;
unsigned rcvtimeo; /* millis; 0 means wait forever */
unsigned sndtimeo; /* millis; 0 means wait forever */
void *connect_op;
struct Cursor *shared;
};
struct Fds {
@ -34,5 +42,9 @@ struct Fds {
struct Fd *p, *e;
};
void __fd_lock(struct Fd *);
void __fd_unlock(struct Fd *);
struct Cursor *__cursor_new(void);
COSMOPOLITAN_C_END_
#endif /* COSMOPOLITAN_LIBC_CALLS_STRUCT_FD_INTERNAL_H_ */

View file

@ -17,6 +17,8 @@
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/calls/state.internal.h"
#include "libc/intrin/fds.h"
#include "libc/runtime/runtime.h"
#include "libc/thread/thread.h"
void __fds_lock(void) {
@ -26,3 +28,23 @@ void __fds_lock(void) {
void __fds_unlock(void) {
pthread_mutex_unlock(&__fds_lock_obj);
}
void __fd_lock(struct Fd *f) {
pthread_mutex_lock(&f->shared->lock);
}
void __fd_unlock(struct Fd *f) {
pthread_mutex_unlock(&f->shared->lock);
}
struct Cursor *__cursor_new(void) {
struct Cursor *c;
if ((c = _mapshared(sizeof(struct Cursor)))) {
pthread_mutexattr_t attr;
pthread_mutexattr_init(&attr);
pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED);
pthread_mutex_init(&c->lock, &attr);
pthread_mutexattr_destroy(&attr);
}
return c;
}

View file

@ -17,9 +17,9 @@
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/assert.h"
#include "libc/intrin/fds.h"
#include "libc/calls/syscall_support-nt.internal.h"
#include "libc/fmt/itoa.h"
#include "libc/intrin/fds.h"
#include "libc/intrin/strace.h"
#include "libc/mem/mem.h"
#include "libc/nt/files.h"
@ -151,7 +151,12 @@ textwindows char *__describe_fds(const struct Fd *fds, size_t fdslen,
*p++ = '_';
p = FormatInt64(p, f->mode);
*p++ = '_';
p = FormatInt64(p, f->pointer);
//
// - v1 abi: This field was originally the file pointer.
//
// - v2 abi: This field is the negated shared memory address.
//
p = FormatInt64(p, -(uintptr_t)f->shared);
*p++ = '_';
p = FormatInt64(p, f->type);
*p++ = '_';

View file

@ -62,9 +62,10 @@ static dontinline textwindows ssize_t sys_sendfile_nt(
int outfd, int infd, int64_t *opt_in_out_inoffset, uint32_t uptobytes) {
ssize_t rc;
uint32_t flags = 0;
bool locked = false;
int64_t ih, oh, eof, offset;
struct NtByHandleFileInformation wst;
if (!__isfdkind(infd, kFdFile))
if (!__isfdkind(infd, kFdFile) || !g_fds.p[infd].shared)
return ebadf();
if (!__isfdkind(outfd, kFdSocket))
return ebadf();
@ -73,7 +74,9 @@ static dontinline textwindows ssize_t sys_sendfile_nt(
if (opt_in_out_inoffset) {
offset = *opt_in_out_inoffset;
} else {
offset = g_fds.p[infd].pointer;
locked = true;
__fd_lock(&g_fds.p[infd]);
offset = g_fds.p[infd].shared->pointer;
}
if (GetFileInformationByHandle(ih, &wst)) {
// TransmitFile() returns EINVAL if `uptobytes` goes past EOF.
@ -82,9 +85,10 @@ static dontinline textwindows ssize_t sys_sendfile_nt(
uptobytes = eof - offset;
}
} else {
if (locked)
__fd_unlock(&g_fds.p[infd]);
return ebadf();
}
BLOCK_SIGNALS;
struct NtOverlapped ov = {.hEvent = WSACreateEvent(), .Pointer = offset};
cosmo_once(&g_transmitfile.once, transmitfile_init);
if (g_transmitfile.lpTransmitFile(oh, ih, uptobytes, 0, &ov, 0, 0) ||
@ -95,7 +99,7 @@ static dontinline textwindows ssize_t sys_sendfile_nt(
if (opt_in_out_inoffset) {
*opt_in_out_inoffset = offset + rc;
} else {
g_fds.p[infd].pointer = offset + rc;
g_fds.p[infd].shared->pointer = offset + rc;
}
} else {
rc = __winsockerr();
@ -103,8 +107,9 @@ static dontinline textwindows ssize_t sys_sendfile_nt(
} else {
rc = __winsockerr();
}
if (locked)
__fd_unlock(&g_fds.p[infd]);
WSACloseEvent(ov.hEvent);
ALLOW_SIGNALS;
return rc;
}
@ -186,7 +191,9 @@ ssize_t sendfile(int outfd, int infd, int64_t *opt_in_out_inoffset,
} else if (IsFreebsd() || IsXnu()) {
rc = sys_sendfile_bsd(outfd, infd, opt_in_out_inoffset, uptobytes);
} else if (IsWindows()) {
BLOCK_SIGNALS;
rc = sys_sendfile_nt(outfd, infd, opt_in_out_inoffset, uptobytes);
ALLOW_SIGNALS;
} else {
rc = enosys();
}

View file

@ -185,8 +185,8 @@ TEST(munmap, tinyFile_preciseUnmapSize) {
TEST(munmap, tinyFile_mapThriceUnmapOnce) {
char *p;
ASSERT_NE(MAP_FAILED, (p = mmap(0, gransz*5, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0)));
ASSERT_SYS(0, 0, munmap(p, gransz*5));
ASSERT_SYS(0, 3, open("doge", O_RDWR | O_CREAT | O_TRUNC, 0644));
ASSERT_SYS(0, 0, munmap(p, gransz*5));
ASSERT_SYS (0, 5, write(3, "hello", 5));
ASSERT_EQ(p+gransz*0, mmap(p+gransz*0, gransz, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0));
ASSERT_EQ(p+gransz*1, mmap(p+gransz*1, 5, PROT_READ, MAP_PRIVATE|MAP_FIXED, 3, 0));

View file

@ -1,22 +1,19 @@
/*-*- mode:c;indent-tabs-mode:nil;c-basic-offset:2;tab-width:8;coding:utf-8 -*-│
vi: set et ft=c ts=2 sts=2 sw=2 fenc=utf-8 :vi
Copyright 2024 Justine Alexandra Roberts Tunney
Permission to use, copy, modify, and/or distribute this software for
any purpose with or without fee is hereby granted, provided that the
above copyright notice and this permission notice appear in all copies.
THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
PERFORMANCE OF THIS SOFTWARE.
*/
#include "libc/math.h"
// Copyright 2024 Justine Alexandra Roberts Tunney
//
// Permission to use, copy, modify, and/or distribute this software for
// any purpose with or without fee is hereby granted, provided that the
// above copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
// PERFORMANCE OF THIS SOFTWARE.
#include <math.h>
#define CHECK(x) \
if (!(x)) \

View file

@ -0,0 +1,107 @@
// Copyright 2024 Justine Alexandra Roberts Tunney
//
// Permission to use, copy, modify, and/or distribute this software for
// any purpose with or without fee is hereby granted, provided that the
// above copyright notice and this permission notice appear in all copies.
//
// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL
// WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED
// WARRANTIES OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE
// AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
// DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
// PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
// TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
// PERFORMANCE OF THIS SOFTWARE.
#include <stdatomic.h>
#include <stdlib.h>
#include <sys/mman.h>
#include <unistd.h>
// test that file offset is shared across multiple processes
atomic_int *phase;
int main() {
if ((phase = mmap(0, sizeof(atomic_int), PROT_READ | PROT_WRITE,
MAP_SHARED | MAP_ANONYMOUS, -1, 0)) == MAP_FAILED)
return 1;
int fd;
char path[] = "/tmp/fd_test.XXXXXX";
if ((fd = mkstemp(path)) == -1)
return 2;
if (lseek(fd, 0, SEEK_CUR) != 0)
return 22;
if (write(fd, "0", 1) != 1)
return 3;
if (lseek(fd, 0, SEEK_CUR) != 1)
return 33;
int pid;
if ((pid = fork()) == -1)
return 4;
if (!pid) {
if (write(fd, "1", 1) != 1)
_Exit(100);
if (lseek(fd, 0, SEEK_CUR) != 2)
_Exit(101);
*phase = 1;
for (;;)
if (*phase == 2)
break;
if (write(fd, "3", 1) != 1)
_Exit(102);
if (lseek(fd, 0, SEEK_CUR) != 4)
_Exit(103);
*phase = 3;
_Exit(0);
}
for (;;)
if (*phase == 1)
break;
if (write(fd, "2", 1) != 1)
return 5;
if (lseek(fd, 0, SEEK_CUR) != 3)
return 55;
*phase = 2;
for (;;)
if (*phase == 3)
break;
if (write(fd, "4", 1) != 1)
return 6;
if (lseek(fd, 0, SEEK_CUR) != 5)
return 66;
int ws;
if (wait(&ws) == -1)
return 7;
if (!WIFEXITED(ws))
return 8;
if (WEXITSTATUS(ws))
return WEXITSTATUS(ws);
char buf[16] = {0};
if (pread(fd, buf, 15, 0) != 5)
return 12;
if (lseek(fd, 0, SEEK_CUR) != 5)
return 77;
if (close(fd))
return 13;
if (unlink(path))
return 14;
if (strcmp(buf, "01234"))
return 15;
}