mirror of
https://github.com/torvalds/linux.git
synced 2026-06-07 22:14:04 +02:00
Linux 5.10
-----BEGIN PGP SIGNATURE----- iQFSBAABCAA8FiEEq68RxlopcLEwq+PEeb4+QwBBGIYFAl/WmJoeHHRvcnZhbGRz QGxpbnV4LWZvdW5kYXRpb24ub3JnAAoJEHm+PkMAQRiGv1oH+wUPC+TflsJyjjG4 /UrmkfQGkMJlUuZbiuLSgLys2AxpiKE81qKpqi9+Fgv3TLZs1WZbWB4Aspj1SM9d dVHNz1RXqUKZVNLrBdB/NyiBDWO/XJyNe14DFJa7VQJzlPYheIr7Nij5MPt0ipEQ ZXNiXrVSC511fvLJ+74kQFXKohJ1LvxKO9tXHwn+RnTdQf+Rdat17BKEkxgjzEJr JV8KpzXeUHfAJInNye9YaZFSy2IY7PDBFbR4BAd/jRnqY58q71w0pVb7k1hWphT8 XtzJbR1CHpriBuTG5YrFtSPUbkkPNIXcXa7v1Jo7kscJypjoIYbkfVDU++UbN9j/ ipzLI5o= =MX0J -----END PGP SIGNATURE----- Merge tag 'v5.10' into android-mainline Linux 5.10 Signed-off-by: Greg Kroah-Hartman <gregkh@google.com> Change-Id: I5cff928adff4f083fec6db63cee0319f1e212bee
This commit is contained in:
commit
b9e54f8e02
2
Makefile
2
Makefile
|
|
@ -2,7 +2,7 @@
|
|||
VERSION = 5
|
||||
PATCHLEVEL = 10
|
||||
SUBLEVEL = 0
|
||||
EXTRAVERSION = -rc7
|
||||
EXTRAVERSION =
|
||||
NAME = Kleptomaniac Octopus
|
||||
|
||||
# *DOCUMENTATION*
|
||||
|
|
|
|||
|
|
@ -155,6 +155,7 @@ enum page_cache_mode {
|
|||
#define _PAGE_ENC (_AT(pteval_t, sme_me_mask))
|
||||
|
||||
#define _PAGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT)
|
||||
#define _PAGE_LARGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT_LARGE)
|
||||
|
||||
#define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC))
|
||||
#define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP))
|
||||
|
|
|
|||
|
|
@ -98,12 +98,13 @@ static inline void sync_core_before_usermode(void)
|
|||
/* With PTI, we unconditionally serialize before running user code. */
|
||||
if (static_cpu_has(X86_FEATURE_PTI))
|
||||
return;
|
||||
|
||||
/*
|
||||
* Return from interrupt and NMI is done through iret, which is core
|
||||
* serializing.
|
||||
* Even if we're in an interrupt, we might reschedule before returning,
|
||||
* in which case we could switch to a different thread in the same mm
|
||||
* and return using SYSRET or SYSEXIT. Instead of trying to keep
|
||||
* track of our need to sync the core, just sync right away.
|
||||
*/
|
||||
if (in_irq() || in_nmi())
|
||||
return;
|
||||
sync_core();
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -273,20 +273,24 @@ static int assign_irq_vector_any_locked(struct irq_data *irqd)
|
|||
const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd);
|
||||
int node = irq_data_get_node(irqd);
|
||||
|
||||
if (node == NUMA_NO_NODE)
|
||||
goto all;
|
||||
/* Try the intersection of @affmsk and node mask */
|
||||
cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk);
|
||||
if (!assign_vector_locked(irqd, vector_searchmask))
|
||||
return 0;
|
||||
/* Try the node mask */
|
||||
if (!assign_vector_locked(irqd, cpumask_of_node(node)))
|
||||
return 0;
|
||||
all:
|
||||
if (node != NUMA_NO_NODE) {
|
||||
/* Try the intersection of @affmsk and node mask */
|
||||
cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk);
|
||||
if (!assign_vector_locked(irqd, vector_searchmask))
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Try the full affinity mask */
|
||||
cpumask_and(vector_searchmask, affmsk, cpu_online_mask);
|
||||
if (!assign_vector_locked(irqd, vector_searchmask))
|
||||
return 0;
|
||||
|
||||
if (node != NUMA_NO_NODE) {
|
||||
/* Try the node mask */
|
||||
if (!assign_vector_locked(irqd, cpumask_of_node(node)))
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* Try the full online mask */
|
||||
return assign_vector_locked(irqd, cpu_online_mask);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -279,7 +279,6 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr)
|
|||
return;
|
||||
|
||||
chunks = mbm_overflow_count(m->prev_bw_msr, tval, rr->r->mbm_width);
|
||||
m->chunks += chunks;
|
||||
cur_bw = (chunks * r->mon_scale) >> 20;
|
||||
|
||||
if (m->delta_comp)
|
||||
|
|
@ -450,15 +449,14 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid)
|
|||
}
|
||||
if (is_mbm_local_enabled()) {
|
||||
rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
|
||||
__mon_event_count(rmid, &rr);
|
||||
|
||||
/*
|
||||
* Call the MBA software controller only for the
|
||||
* control groups and when user has enabled
|
||||
* the software controller explicitly.
|
||||
*/
|
||||
if (!is_mba_sc(NULL))
|
||||
__mon_event_count(rmid, &rr);
|
||||
else
|
||||
if (is_mba_sc(NULL))
|
||||
mbm_bw_count(rmid, &rr);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -272,6 +272,19 @@ static int insn_is_indirect_jump(struct insn *insn)
|
|||
return ret;
|
||||
}
|
||||
|
||||
static bool is_padding_int3(unsigned long addr, unsigned long eaddr)
|
||||
{
|
||||
unsigned char ops;
|
||||
|
||||
for (; addr < eaddr; addr++) {
|
||||
if (get_kernel_nofault(ops, (void *)addr) < 0 ||
|
||||
ops != INT3_INSN_OPCODE)
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/* Decode whole function to ensure any instructions don't jump into target */
|
||||
static int can_optimize(unsigned long paddr)
|
||||
{
|
||||
|
|
@ -310,9 +323,14 @@ static int can_optimize(unsigned long paddr)
|
|||
return 0;
|
||||
kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
|
||||
insn_get_length(&insn);
|
||||
/* Another subsystem puts a breakpoint */
|
||||
/*
|
||||
* In the case of detecting unknown breakpoint, this could be
|
||||
* a padding INT3 between functions. Let's check that all the
|
||||
* rest of the bytes are also INT3.
|
||||
*/
|
||||
if (insn.opcode.bytes[0] == INT3_INSN_OPCODE)
|
||||
return 0;
|
||||
return is_padding_int3(addr, paddr - offset + size) ? 1 : 0;
|
||||
|
||||
/* Recover address */
|
||||
insn.kaddr = (void *)addr;
|
||||
insn.next_byte = (void *)(addr + insn.length);
|
||||
|
|
|
|||
|
|
@ -45,8 +45,8 @@
|
|||
#define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL)
|
||||
|
||||
#define PMD_FLAGS_DEC PMD_FLAGS_LARGE
|
||||
#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \
|
||||
(_PAGE_PAT | _PAGE_PWT))
|
||||
#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_LARGE_CACHE_MASK) | \
|
||||
(_PAGE_PAT_LARGE | _PAGE_PWT))
|
||||
|
||||
#define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC)
|
||||
|
||||
|
|
|
|||
|
|
@ -474,8 +474,14 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
|
|||
/*
|
||||
* The membarrier system call requires a full memory barrier and
|
||||
* core serialization before returning to user-space, after
|
||||
* storing to rq->curr. Writing to CR3 provides that full
|
||||
* memory barrier and core serializing instruction.
|
||||
* storing to rq->curr, when changing mm. This is because
|
||||
* membarrier() sends IPIs to all CPUs that are in the target mm
|
||||
* to make them issue memory barriers. However, if another CPU
|
||||
* switches to/from the target mm concurrently with
|
||||
* membarrier(), it can cause that CPU not to receive an IPI
|
||||
* when it really should issue a memory barrier. Writing to CR3
|
||||
* provides that full memory barrier and core serializing
|
||||
* instruction.
|
||||
*/
|
||||
if (real_prev == next) {
|
||||
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) !=
|
||||
|
|
|
|||
|
|
@ -3728,6 +3728,17 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits)
|
|||
|
||||
blk_limits_io_min(limits, chunk_size_bytes);
|
||||
blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs));
|
||||
|
||||
/*
|
||||
* RAID10 personality requires bio splitting,
|
||||
* RAID0/1/4/5/6 don't and process large discard bios properly.
|
||||
*/
|
||||
if (rs_is_raid10(rs)) {
|
||||
limits->discard_granularity = max(chunk_size_bytes,
|
||||
limits->discard_granularity);
|
||||
limits->max_discard_sectors = min_not_zero(rs->md.chunk_sectors,
|
||||
limits->max_discard_sectors);
|
||||
}
|
||||
}
|
||||
|
||||
static void raid_postsuspend(struct dm_target *ti)
|
||||
|
|
|
|||
|
|
@ -8582,26 +8582,6 @@ void md_write_end(struct mddev *mddev)
|
|||
|
||||
EXPORT_SYMBOL(md_write_end);
|
||||
|
||||
/* This is used by raid0 and raid10 */
|
||||
void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
|
||||
struct bio *bio, sector_t start, sector_t size)
|
||||
{
|
||||
struct bio *discard_bio = NULL;
|
||||
|
||||
if (__blkdev_issue_discard(rdev->bdev, start, size,
|
||||
GFP_NOIO, 0, &discard_bio) || !discard_bio)
|
||||
return;
|
||||
|
||||
bio_chain(discard_bio, bio);
|
||||
bio_clone_blkg_association(discard_bio, bio);
|
||||
if (mddev->gendisk)
|
||||
trace_block_bio_remap(bdev_get_queue(rdev->bdev),
|
||||
discard_bio, disk_devt(mddev->gendisk),
|
||||
bio->bi_iter.bi_sector);
|
||||
submit_bio_noacct(discard_bio);
|
||||
}
|
||||
EXPORT_SYMBOL(md_submit_discard_bio);
|
||||
|
||||
/* md_allow_write(mddev)
|
||||
* Calling this ensures that the array is marked 'active' so that writes
|
||||
* may proceed without blocking. It is important to call this before
|
||||
|
|
|
|||
|
|
@ -311,7 +311,7 @@ struct mddev {
|
|||
int external; /* metadata is
|
||||
* managed externally */
|
||||
char metadata_type[17]; /* externally set*/
|
||||
int chunk_sectors;
|
||||
unsigned int chunk_sectors;
|
||||
time64_t ctime, utime;
|
||||
int level, layout;
|
||||
char clevel[16];
|
||||
|
|
@ -339,7 +339,7 @@ struct mddev {
|
|||
*/
|
||||
sector_t reshape_position;
|
||||
int delta_disks, new_level, new_layout;
|
||||
int new_chunk_sectors;
|
||||
unsigned int new_chunk_sectors;
|
||||
int reshape_backwards;
|
||||
|
||||
struct md_thread *thread; /* management thread */
|
||||
|
|
@ -713,8 +713,6 @@ extern void md_write_end(struct mddev *mddev);
|
|||
extern void md_done_sync(struct mddev *mddev, int blocks, int ok);
|
||||
extern void md_error(struct mddev *mddev, struct md_rdev *rdev);
|
||||
extern void md_finish_reshape(struct mddev *mddev);
|
||||
extern void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev,
|
||||
struct bio *bio, sector_t start, sector_t size);
|
||||
|
||||
extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio);
|
||||
extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
|
||||
|
|
|
|||
|
|
@ -477,6 +477,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
|
|||
|
||||
for (disk = 0; disk < zone->nb_dev; disk++) {
|
||||
sector_t dev_start, dev_end;
|
||||
struct bio *discard_bio = NULL;
|
||||
struct md_rdev *rdev;
|
||||
|
||||
if (disk < start_disk_index)
|
||||
|
|
@ -499,9 +500,18 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio)
|
|||
|
||||
rdev = conf->devlist[(zone - conf->strip_zone) *
|
||||
conf->strip_zone[0].nb_dev + disk];
|
||||
md_submit_discard_bio(mddev, rdev, bio,
|
||||
if (__blkdev_issue_discard(rdev->bdev,
|
||||
dev_start + zone->dev_start + rdev->data_offset,
|
||||
dev_end - dev_start);
|
||||
dev_end - dev_start, GFP_NOIO, 0, &discard_bio) ||
|
||||
!discard_bio)
|
||||
continue;
|
||||
bio_chain(discard_bio, bio);
|
||||
bio_clone_blkg_association(discard_bio, bio);
|
||||
if (mddev->gendisk)
|
||||
trace_block_bio_remap(bdev_get_queue(rdev->bdev),
|
||||
discard_bio, disk_devt(mddev->gendisk),
|
||||
bio->bi_iter.bi_sector);
|
||||
submit_bio_noacct(discard_bio);
|
||||
}
|
||||
bio_endio(bio);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -91,7 +91,7 @@ static inline struct r10bio *get_resync_r10bio(struct bio *bio)
|
|||
static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
|
||||
{
|
||||
struct r10conf *conf = data;
|
||||
int size = offsetof(struct r10bio, devs[conf->geo.raid_disks]);
|
||||
int size = offsetof(struct r10bio, devs[conf->copies]);
|
||||
|
||||
/* allocate a r10bio with room for raid_disks entries in the
|
||||
* bios array */
|
||||
|
|
@ -238,7 +238,7 @@ static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio)
|
|||
{
|
||||
int i;
|
||||
|
||||
for (i = 0; i < conf->geo.raid_disks; i++) {
|
||||
for (i = 0; i < conf->copies; i++) {
|
||||
struct bio **bio = & r10_bio->devs[i].bio;
|
||||
if (!BIO_SPECIAL(*bio))
|
||||
bio_put(*bio);
|
||||
|
|
@ -327,7 +327,7 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
|
|||
int slot;
|
||||
int repl = 0;
|
||||
|
||||
for (slot = 0; slot < conf->geo.raid_disks; slot++) {
|
||||
for (slot = 0; slot < conf->copies; slot++) {
|
||||
if (r10_bio->devs[slot].bio == bio)
|
||||
break;
|
||||
if (r10_bio->devs[slot].repl_bio == bio) {
|
||||
|
|
@ -336,6 +336,7 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio,
|
|||
}
|
||||
}
|
||||
|
||||
BUG_ON(slot == conf->copies);
|
||||
update_head_pos(slot, r10_bio);
|
||||
|
||||
if (slotp)
|
||||
|
|
@ -1275,75 +1276,12 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
|
|||
}
|
||||
}
|
||||
|
||||
static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio)
|
||||
{
|
||||
int i;
|
||||
struct r10conf *conf = mddev->private;
|
||||
struct md_rdev *blocked_rdev;
|
||||
|
||||
retry_wait:
|
||||
blocked_rdev = NULL;
|
||||
rcu_read_lock();
|
||||
for (i = 0; i < conf->copies; i++) {
|
||||
struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
|
||||
struct md_rdev *rrdev = rcu_dereference(
|
||||
conf->mirrors[i].replacement);
|
||||
if (rdev == rrdev)
|
||||
rrdev = NULL;
|
||||
if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
blocked_rdev = rdev;
|
||||
break;
|
||||
}
|
||||
if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
|
||||
atomic_inc(&rrdev->nr_pending);
|
||||
blocked_rdev = rrdev;
|
||||
break;
|
||||
}
|
||||
|
||||
if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) {
|
||||
sector_t first_bad;
|
||||
sector_t dev_sector = r10_bio->devs[i].addr;
|
||||
int bad_sectors;
|
||||
int is_bad;
|
||||
|
||||
/* Discard request doesn't care the write result
|
||||
* so it doesn't need to wait blocked disk here.
|
||||
*/
|
||||
if (!r10_bio->sectors)
|
||||
continue;
|
||||
|
||||
is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors,
|
||||
&first_bad, &bad_sectors);
|
||||
if (is_bad < 0) {
|
||||
/* Mustn't write here until the bad block
|
||||
* is acknowledged
|
||||
*/
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
set_bit(BlockedBadBlocks, &rdev->flags);
|
||||
blocked_rdev = rdev;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
if (unlikely(blocked_rdev)) {
|
||||
/* Have to wait for this device to get unblocked, then retry */
|
||||
allow_barrier(conf);
|
||||
raid10_log(conf->mddev, "%s wait rdev %d blocked",
|
||||
__func__, blocked_rdev->raid_disk);
|
||||
md_wait_for_blocked_rdev(blocked_rdev, mddev);
|
||||
wait_barrier(conf);
|
||||
goto retry_wait;
|
||||
}
|
||||
}
|
||||
|
||||
static void raid10_write_request(struct mddev *mddev, struct bio *bio,
|
||||
struct r10bio *r10_bio)
|
||||
{
|
||||
struct r10conf *conf = mddev->private;
|
||||
int i;
|
||||
struct md_rdev *blocked_rdev;
|
||||
sector_t sectors;
|
||||
int max_sectors;
|
||||
|
||||
|
|
@ -1401,9 +1339,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
|
|||
|
||||
r10_bio->read_slot = -1; /* make sure repl_bio gets freed */
|
||||
raid10_find_phys(conf, r10_bio);
|
||||
|
||||
wait_blocked_dev(mddev, r10_bio);
|
||||
|
||||
retry_write:
|
||||
blocked_rdev = NULL;
|
||||
rcu_read_lock();
|
||||
max_sectors = r10_bio->sectors;
|
||||
|
||||
|
|
@ -1414,6 +1351,16 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
|
|||
conf->mirrors[d].replacement);
|
||||
if (rdev == rrdev)
|
||||
rrdev = NULL;
|
||||
if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
blocked_rdev = rdev;
|
||||
break;
|
||||
}
|
||||
if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) {
|
||||
atomic_inc(&rrdev->nr_pending);
|
||||
blocked_rdev = rrdev;
|
||||
break;
|
||||
}
|
||||
if (rdev && (test_bit(Faulty, &rdev->flags)))
|
||||
rdev = NULL;
|
||||
if (rrdev && (test_bit(Faulty, &rrdev->flags)))
|
||||
|
|
@ -1434,6 +1381,15 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
|
|||
|
||||
is_bad = is_badblock(rdev, dev_sector, max_sectors,
|
||||
&first_bad, &bad_sectors);
|
||||
if (is_bad < 0) {
|
||||
/* Mustn't write here until the bad block
|
||||
* is acknowledged
|
||||
*/
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
set_bit(BlockedBadBlocks, &rdev->flags);
|
||||
blocked_rdev = rdev;
|
||||
break;
|
||||
}
|
||||
if (is_bad && first_bad <= dev_sector) {
|
||||
/* Cannot write here at all */
|
||||
bad_sectors -= (dev_sector - first_bad);
|
||||
|
|
@ -1469,6 +1425,35 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio,
|
|||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
if (unlikely(blocked_rdev)) {
|
||||
/* Have to wait for this device to get unblocked, then retry */
|
||||
int j;
|
||||
int d;
|
||||
|
||||
for (j = 0; j < i; j++) {
|
||||
if (r10_bio->devs[j].bio) {
|
||||
d = r10_bio->devs[j].devnum;
|
||||
rdev_dec_pending(conf->mirrors[d].rdev, mddev);
|
||||
}
|
||||
if (r10_bio->devs[j].repl_bio) {
|
||||
struct md_rdev *rdev;
|
||||
d = r10_bio->devs[j].devnum;
|
||||
rdev = conf->mirrors[d].replacement;
|
||||
if (!rdev) {
|
||||
/* Race with remove_disk */
|
||||
smp_mb();
|
||||
rdev = conf->mirrors[d].rdev;
|
||||
}
|
||||
rdev_dec_pending(rdev, mddev);
|
||||
}
|
||||
}
|
||||
allow_barrier(conf);
|
||||
raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk);
|
||||
md_wait_for_blocked_rdev(blocked_rdev, mddev);
|
||||
wait_barrier(conf);
|
||||
goto retry_write;
|
||||
}
|
||||
|
||||
if (max_sectors < r10_bio->sectors)
|
||||
r10_bio->sectors = max_sectors;
|
||||
|
||||
|
|
@ -1508,7 +1493,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
|
|||
r10_bio->mddev = mddev;
|
||||
r10_bio->sector = bio->bi_iter.bi_sector;
|
||||
r10_bio->state = 0;
|
||||
memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->geo.raid_disks);
|
||||
memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies);
|
||||
|
||||
if (bio_data_dir(bio) == READ)
|
||||
raid10_read_request(mddev, bio, r10_bio);
|
||||
|
|
@ -1516,296 +1501,6 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors)
|
|||
raid10_write_request(mddev, bio, r10_bio);
|
||||
}
|
||||
|
||||
static struct bio *raid10_split_bio(struct r10conf *conf,
|
||||
struct bio *bio, sector_t sectors, bool want_first)
|
||||
{
|
||||
struct bio *split;
|
||||
|
||||
split = bio_split(bio, sectors, GFP_NOIO, &conf->bio_split);
|
||||
bio_chain(split, bio);
|
||||
allow_barrier(conf);
|
||||
if (want_first) {
|
||||
submit_bio_noacct(bio);
|
||||
bio = split;
|
||||
} else
|
||||
submit_bio_noacct(split);
|
||||
wait_barrier(conf);
|
||||
|
||||
return bio;
|
||||
}
|
||||
|
||||
static void raid_end_discard_bio(struct r10bio *r10bio)
|
||||
{
|
||||
struct r10conf *conf = r10bio->mddev->private;
|
||||
struct r10bio *first_r10bio;
|
||||
|
||||
while (atomic_dec_and_test(&r10bio->remaining)) {
|
||||
|
||||
allow_barrier(conf);
|
||||
|
||||
if (!test_bit(R10BIO_Discard, &r10bio->state)) {
|
||||
first_r10bio = (struct r10bio *)r10bio->master_bio;
|
||||
free_r10bio(r10bio);
|
||||
r10bio = first_r10bio;
|
||||
} else {
|
||||
md_write_end(r10bio->mddev);
|
||||
bio_endio(r10bio->master_bio);
|
||||
free_r10bio(r10bio);
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static void raid10_end_discard_request(struct bio *bio)
|
||||
{
|
||||
struct r10bio *r10_bio = bio->bi_private;
|
||||
struct r10conf *conf = r10_bio->mddev->private;
|
||||
struct md_rdev *rdev = NULL;
|
||||
int dev;
|
||||
int slot, repl;
|
||||
|
||||
/*
|
||||
* We don't care the return value of discard bio
|
||||
*/
|
||||
if (!test_bit(R10BIO_Uptodate, &r10_bio->state))
|
||||
set_bit(R10BIO_Uptodate, &r10_bio->state);
|
||||
|
||||
dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
|
||||
if (repl)
|
||||
rdev = conf->mirrors[dev].replacement;
|
||||
if (!rdev) {
|
||||
/* raid10_remove_disk uses smp_mb to make sure rdev is set to
|
||||
* replacement before setting replacement to NULL. It can read
|
||||
* rdev first without barrier protect even replacment is NULL
|
||||
*/
|
||||
smp_rmb();
|
||||
rdev = conf->mirrors[dev].rdev;
|
||||
}
|
||||
|
||||
raid_end_discard_bio(r10_bio);
|
||||
rdev_dec_pending(rdev, conf->mddev);
|
||||
}
|
||||
|
||||
/* There are some limitations to handle discard bio
|
||||
* 1st, the discard size is bigger than stripe_size*2.
|
||||
* 2st, if the discard bio spans reshape progress, we use the old way to
|
||||
* handle discard bio
|
||||
*/
|
||||
static int raid10_handle_discard(struct mddev *mddev, struct bio *bio)
|
||||
{
|
||||
struct r10conf *conf = mddev->private;
|
||||
struct geom *geo = &conf->geo;
|
||||
struct r10bio *r10_bio, *first_r10bio;
|
||||
int far_copies = geo->far_copies;
|
||||
bool first_copy = true;
|
||||
|
||||
int disk;
|
||||
sector_t chunk;
|
||||
unsigned int stripe_size;
|
||||
sector_t split_size;
|
||||
|
||||
sector_t bio_start, bio_end;
|
||||
sector_t first_stripe_index, last_stripe_index;
|
||||
sector_t start_disk_offset;
|
||||
unsigned int start_disk_index;
|
||||
sector_t end_disk_offset;
|
||||
unsigned int end_disk_index;
|
||||
unsigned int remainder;
|
||||
|
||||
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
|
||||
return -EAGAIN;
|
||||
|
||||
wait_barrier(conf);
|
||||
|
||||
/* Check reshape again to avoid reshape happens after checking
|
||||
* MD_RECOVERY_RESHAPE and before wait_barrier
|
||||
*/
|
||||
if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
|
||||
goto out;
|
||||
|
||||
stripe_size = geo->raid_disks << geo->chunk_shift;
|
||||
bio_start = bio->bi_iter.bi_sector;
|
||||
bio_end = bio_end_sector(bio);
|
||||
|
||||
/* Maybe one discard bio is smaller than strip size or across one stripe
|
||||
* and discard region is larger than one stripe size. For far offset layout,
|
||||
* if the discard region is not aligned with stripe size, there is hole
|
||||
* when we submit discard bio to member disk. For simplicity, we only
|
||||
* handle discard bio which discard region is bigger than stripe_size*2
|
||||
*/
|
||||
if (bio_sectors(bio) < stripe_size*2)
|
||||
goto out;
|
||||
|
||||
/* For far and far offset layout, if bio is not aligned with stripe size,
|
||||
* it splits the part that is not aligned with strip size.
|
||||
*/
|
||||
div_u64_rem(bio_start, stripe_size, &remainder);
|
||||
if ((far_copies > 1) && remainder) {
|
||||
split_size = stripe_size - remainder;
|
||||
bio = raid10_split_bio(conf, bio, split_size, false);
|
||||
}
|
||||
div_u64_rem(bio_end, stripe_size, &remainder);
|
||||
if ((far_copies > 1) && remainder) {
|
||||
split_size = bio_sectors(bio) - remainder;
|
||||
bio = raid10_split_bio(conf, bio, split_size, true);
|
||||
}
|
||||
|
||||
bio_start = bio->bi_iter.bi_sector;
|
||||
bio_end = bio_end_sector(bio);
|
||||
|
||||
/* raid10 uses chunk as the unit to store data. It's similar like raid0.
|
||||
* One stripe contains the chunks from all member disk (one chunk from
|
||||
* one disk at the same HBA address). For layout detail, see 'man md 4'
|
||||
*/
|
||||
chunk = bio_start >> geo->chunk_shift;
|
||||
chunk *= geo->near_copies;
|
||||
first_stripe_index = chunk;
|
||||
start_disk_index = sector_div(first_stripe_index, geo->raid_disks);
|
||||
if (geo->far_offset)
|
||||
first_stripe_index *= geo->far_copies;
|
||||
start_disk_offset = (bio_start & geo->chunk_mask) +
|
||||
(first_stripe_index << geo->chunk_shift);
|
||||
|
||||
chunk = bio_end >> geo->chunk_shift;
|
||||
chunk *= geo->near_copies;
|
||||
last_stripe_index = chunk;
|
||||
end_disk_index = sector_div(last_stripe_index, geo->raid_disks);
|
||||
if (geo->far_offset)
|
||||
last_stripe_index *= geo->far_copies;
|
||||
end_disk_offset = (bio_end & geo->chunk_mask) +
|
||||
(last_stripe_index << geo->chunk_shift);
|
||||
|
||||
retry_discard:
|
||||
r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO);
|
||||
r10_bio->mddev = mddev;
|
||||
r10_bio->state = 0;
|
||||
r10_bio->sectors = 0;
|
||||
memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks);
|
||||
wait_blocked_dev(mddev, r10_bio);
|
||||
|
||||
/* For far layout it needs more than one r10bio to cover all regions.
|
||||
* Inspired by raid10_sync_request, we can use the first r10bio->master_bio
|
||||
* to record the discard bio. Other r10bio->master_bio record the first
|
||||
* r10bio. The first r10bio only release after all other r10bios finish.
|
||||
* The discard bio returns only first r10bio finishes
|
||||
*/
|
||||
if (first_copy) {
|
||||
r10_bio->master_bio = bio;
|
||||
set_bit(R10BIO_Discard, &r10_bio->state);
|
||||
first_copy = false;
|
||||
first_r10bio = r10_bio;
|
||||
} else
|
||||
r10_bio->master_bio = (struct bio *)first_r10bio;
|
||||
|
||||
rcu_read_lock();
|
||||
for (disk = 0; disk < geo->raid_disks; disk++) {
|
||||
struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
|
||||
struct md_rdev *rrdev = rcu_dereference(
|
||||
conf->mirrors[disk].replacement);
|
||||
|
||||
r10_bio->devs[disk].bio = NULL;
|
||||
r10_bio->devs[disk].repl_bio = NULL;
|
||||
|
||||
if (rdev && (test_bit(Faulty, &rdev->flags)))
|
||||
rdev = NULL;
|
||||
if (rrdev && (test_bit(Faulty, &rrdev->flags)))
|
||||
rrdev = NULL;
|
||||
if (!rdev && !rrdev)
|
||||
continue;
|
||||
|
||||
if (rdev) {
|
||||
r10_bio->devs[disk].bio = bio;
|
||||
atomic_inc(&rdev->nr_pending);
|
||||
}
|
||||
if (rrdev) {
|
||||
r10_bio->devs[disk].repl_bio = bio;
|
||||
atomic_inc(&rrdev->nr_pending);
|
||||
}
|
||||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
atomic_set(&r10_bio->remaining, 1);
|
||||
for (disk = 0; disk < geo->raid_disks; disk++) {
|
||||
sector_t dev_start, dev_end;
|
||||
struct bio *mbio, *rbio = NULL;
|
||||
struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev);
|
||||
struct md_rdev *rrdev = rcu_dereference(
|
||||
conf->mirrors[disk].replacement);
|
||||
|
||||
/*
|
||||
* Now start to calculate the start and end address for each disk.
|
||||
* The space between dev_start and dev_end is the discard region.
|
||||
*
|
||||
* For dev_start, it needs to consider three conditions:
|
||||
* 1st, the disk is before start_disk, you can imagine the disk in
|
||||
* the next stripe. So the dev_start is the start address of next
|
||||
* stripe.
|
||||
* 2st, the disk is after start_disk, it means the disk is at the
|
||||
* same stripe of first disk
|
||||
* 3st, the first disk itself, we can use start_disk_offset directly
|
||||
*/
|
||||
if (disk < start_disk_index)
|
||||
dev_start = (first_stripe_index + 1) * mddev->chunk_sectors;
|
||||
else if (disk > start_disk_index)
|
||||
dev_start = first_stripe_index * mddev->chunk_sectors;
|
||||
else
|
||||
dev_start = start_disk_offset;
|
||||
|
||||
if (disk < end_disk_index)
|
||||
dev_end = (last_stripe_index + 1) * mddev->chunk_sectors;
|
||||
else if (disk > end_disk_index)
|
||||
dev_end = last_stripe_index * mddev->chunk_sectors;
|
||||
else
|
||||
dev_end = end_disk_offset;
|
||||
|
||||
/* It only handles discard bio which size is >= stripe size, so
|
||||
* dev_end > dev_start all the time
|
||||
*/
|
||||
if (r10_bio->devs[disk].bio) {
|
||||
mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
|
||||
mbio->bi_end_io = raid10_end_discard_request;
|
||||
mbio->bi_private = r10_bio;
|
||||
r10_bio->devs[disk].bio = mbio;
|
||||
r10_bio->devs[disk].devnum = disk;
|
||||
atomic_inc(&r10_bio->remaining);
|
||||
md_submit_discard_bio(mddev, rdev, mbio,
|
||||
dev_start + choose_data_offset(r10_bio, rdev),
|
||||
dev_end - dev_start);
|
||||
bio_endio(mbio);
|
||||
}
|
||||
if (r10_bio->devs[disk].repl_bio) {
|
||||
rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set);
|
||||
rbio->bi_end_io = raid10_end_discard_request;
|
||||
rbio->bi_private = r10_bio;
|
||||
r10_bio->devs[disk].repl_bio = rbio;
|
||||
r10_bio->devs[disk].devnum = disk;
|
||||
atomic_inc(&r10_bio->remaining);
|
||||
md_submit_discard_bio(mddev, rrdev, rbio,
|
||||
dev_start + choose_data_offset(r10_bio, rrdev),
|
||||
dev_end - dev_start);
|
||||
bio_endio(rbio);
|
||||
}
|
||||
}
|
||||
|
||||
if (!geo->far_offset && --far_copies) {
|
||||
first_stripe_index += geo->stride >> geo->chunk_shift;
|
||||
start_disk_offset += geo->stride;
|
||||
last_stripe_index += geo->stride >> geo->chunk_shift;
|
||||
end_disk_offset += geo->stride;
|
||||
atomic_inc(&first_r10bio->remaining);
|
||||
raid_end_discard_bio(r10_bio);
|
||||
wait_barrier(conf);
|
||||
goto retry_discard;
|
||||
}
|
||||
|
||||
raid_end_discard_bio(r10_bio);
|
||||
|
||||
return 0;
|
||||
out:
|
||||
allow_barrier(conf);
|
||||
return -EAGAIN;
|
||||
}
|
||||
|
||||
static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
|
||||
{
|
||||
struct r10conf *conf = mddev->private;
|
||||
|
|
@ -1820,10 +1515,6 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio)
|
|||
if (!md_write_start(mddev, bio))
|
||||
return false;
|
||||
|
||||
if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
|
||||
if (!raid10_handle_discard(mddev, bio))
|
||||
return true;
|
||||
|
||||
/*
|
||||
* If this request crosses a chunk boundary, we need to split
|
||||
* it.
|
||||
|
|
@ -4063,7 +3754,7 @@ static int raid10_run(struct mddev *mddev)
|
|||
|
||||
if (mddev->queue) {
|
||||
blk_queue_max_discard_sectors(mddev->queue,
|
||||
UINT_MAX);
|
||||
mddev->chunk_sectors);
|
||||
blk_queue_max_write_same_sectors(mddev->queue, 0);
|
||||
blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
|
||||
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
|
||||
|
|
|
|||
|
|
@ -179,6 +179,5 @@ enum r10bio_state {
|
|||
R10BIO_Previous,
|
||||
/* failfast devices did receive failfast requests. */
|
||||
R10BIO_FailFast,
|
||||
R10BIO_Discard,
|
||||
};
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -38,8 +38,33 @@ static void ipi_mb(void *info)
|
|||
smp_mb(); /* IPIs should be serializing but paranoid. */
|
||||
}
|
||||
|
||||
static void ipi_sync_core(void *info)
|
||||
{
|
||||
/*
|
||||
* The smp_mb() in membarrier after all the IPIs is supposed to
|
||||
* ensure that memory on remote CPUs that occur before the IPI
|
||||
* become visible to membarrier()'s caller -- see scenario B in
|
||||
* the big comment at the top of this file.
|
||||
*
|
||||
* A sync_core() would provide this guarantee, but
|
||||
* sync_core_before_usermode() might end up being deferred until
|
||||
* after membarrier()'s smp_mb().
|
||||
*/
|
||||
smp_mb(); /* IPIs should be serializing but paranoid. */
|
||||
|
||||
sync_core_before_usermode();
|
||||
}
|
||||
|
||||
static void ipi_rseq(void *info)
|
||||
{
|
||||
/*
|
||||
* Ensure that all stores done by the calling thread are visible
|
||||
* to the current task before the current task resumes. We could
|
||||
* probably optimize this away on most architectures, but by the
|
||||
* time we've already sent an IPI, the cost of the extra smp_mb()
|
||||
* is negligible.
|
||||
*/
|
||||
smp_mb();
|
||||
rseq_preempt(current);
|
||||
}
|
||||
|
||||
|
|
@ -154,6 +179,7 @@ static int membarrier_private_expedited(int flags, int cpu_id)
|
|||
if (!(atomic_read(&mm->membarrier_state) &
|
||||
MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY))
|
||||
return -EPERM;
|
||||
ipi_func = ipi_sync_core;
|
||||
} else if (flags == MEMBARRIER_FLAG_RSEQ) {
|
||||
if (!IS_ENABLED(CONFIG_RSEQ))
|
||||
return -EINVAL;
|
||||
|
|
@ -168,7 +194,8 @@ static int membarrier_private_expedited(int flags, int cpu_id)
|
|||
return -EPERM;
|
||||
}
|
||||
|
||||
if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)
|
||||
if (flags != MEMBARRIER_FLAG_SYNC_CORE &&
|
||||
(atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
|
|
@ -187,8 +214,6 @@ static int membarrier_private_expedited(int flags, int cpu_id)
|
|||
|
||||
if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id))
|
||||
goto out;
|
||||
if (cpu_id == raw_smp_processor_id())
|
||||
goto out;
|
||||
rcu_read_lock();
|
||||
p = rcu_dereference(cpu_rq(cpu_id)->curr);
|
||||
if (!p || p->mm != mm) {
|
||||
|
|
@ -203,16 +228,6 @@ static int membarrier_private_expedited(int flags, int cpu_id)
|
|||
for_each_online_cpu(cpu) {
|
||||
struct task_struct *p;
|
||||
|
||||
/*
|
||||
* Skipping the current CPU is OK even through we can be
|
||||
* migrated at any point. The current CPU, at the point
|
||||
* where we read raw_smp_processor_id(), is ensured to
|
||||
* be in program order with respect to the caller
|
||||
* thread. Therefore, we can skip this CPU from the
|
||||
* iteration.
|
||||
*/
|
||||
if (cpu == raw_smp_processor_id())
|
||||
continue;
|
||||
p = rcu_dereference(cpu_rq(cpu)->curr);
|
||||
if (p && p->mm == mm)
|
||||
__cpumask_set_cpu(cpu, tmpmask);
|
||||
|
|
@ -220,12 +235,38 @@ static int membarrier_private_expedited(int flags, int cpu_id)
|
|||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
preempt_disable();
|
||||
if (cpu_id >= 0)
|
||||
if (cpu_id >= 0) {
|
||||
/*
|
||||
* smp_call_function_single() will call ipi_func() if cpu_id
|
||||
* is the calling CPU.
|
||||
*/
|
||||
smp_call_function_single(cpu_id, ipi_func, NULL, 1);
|
||||
else
|
||||
smp_call_function_many(tmpmask, ipi_func, NULL, 1);
|
||||
preempt_enable();
|
||||
} else {
|
||||
/*
|
||||
* For regular membarrier, we can save a few cycles by
|
||||
* skipping the current cpu -- we're about to do smp_mb()
|
||||
* below, and if we migrate to a different cpu, this cpu
|
||||
* and the new cpu will execute a full barrier in the
|
||||
* scheduler.
|
||||
*
|
||||
* For SYNC_CORE, we do need a barrier on the current cpu --
|
||||
* otherwise, if we are migrated and replaced by a different
|
||||
* task in the same mm just before, during, or after
|
||||
* membarrier, we will end up with some thread in the mm
|
||||
* running without a core sync.
|
||||
*
|
||||
* For RSEQ, don't rseq_preempt() the caller. User code
|
||||
* is not supposed to issue syscalls at all from inside an
|
||||
* rseq critical section.
|
||||
*/
|
||||
if (flags != MEMBARRIER_FLAG_SYNC_CORE) {
|
||||
preempt_disable();
|
||||
smp_call_function_many(tmpmask, ipi_func, NULL, true);
|
||||
preempt_enable();
|
||||
} else {
|
||||
on_each_cpu_mask(tmpmask, ipi_func, NULL, true);
|
||||
}
|
||||
}
|
||||
|
||||
out:
|
||||
if (cpu_id < 0)
|
||||
|
|
|
|||
Loading…
Reference in New Issue
Block a user